blob: 838f537e02bf3e23eb12c0a90727c0168d296599 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by ASCII code point (0..127) whose entry is 1 for
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: HORIZONTAL TABULATION, LINE FEED, VERTICAL TABULATION,
       FORM FEED and CARRIAGE RETURN (0x09 - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP, RECORD and UNIT SEPARATOR (0x1C - 0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks: a lookup table indexed by ASCII code point
   (0..127) whose entry is 1 for a line break character and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED (0x0A) and CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP and RECORD SEPARATOR (0x1C - 0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the five least
   significant bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Type of a bloom bitmask; only bits 0..31 are ever used. */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low five bits of `ch` is set in
   `mask`.  The shifted constant is unsigned long so that bit index 31 does
   not overflow a signed int, and `mask` is parenthesized so that an
   expression argument is masked as a whole. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is a
   cheap pre-filter in front of the exact Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Non-zero if `chr` is one of the `setlen` characters of `set`; `mask` is
   consulted first so that most non-members are rejected without scanning
   the set.  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000315 if (free_list) {
316 unicode = free_list;
317 free_list = *(PyUnicodeObject **)unicode;
318 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Keep-Alive optimization: we only upsize the buffer,
321 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000322 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000323 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000324 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000328 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000335 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 if (unicode == NULL)
338 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 }
342
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000343 if (!unicode->str) {
344 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000345 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000346 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000348 * the caller fails before initializing str -- unicode_resize()
349 * reads str[0], and the Keep-Alive optimization can keep memory
350 * allocated for str alive across a call to unicode_dealloc(unicode).
351 * We don't want unicode_resize to read uninitialized memory in
352 * that case.
353 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000354 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000358 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000359 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000361
362 onError:
363 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366}
367
368static
Guido van Rossum9475a232001-10-05 20:51:39 +0000369void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370{
Walter Dörwald16807132007-05-25 13:52:07 +0000371 switch (PyUnicode_CHECK_INTERNED(unicode)) {
372 case SSTATE_NOT_INTERNED:
373 break;
374
375 case SSTATE_INTERNED_MORTAL:
376 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000377 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
379 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000380 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000381 break;
382
383 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000384 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000385
386 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 }
389
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000394 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 unicode->str = NULL;
396 unicode->length = 0;
397 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000398 if (unicode->defenc) {
399 Py_DECREF(unicode->defenc);
400 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000401 }
402 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000403 *(PyUnicodeObject **)unicode = free_list;
404 free_list = unicode;
405 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406 }
407 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000408 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000409 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000410 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411 }
412}
413
Martin v. Löwis18e16552006-02-15 17:27:45 +0000414int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000439 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 *unicode = (PyObject *)w;
441 return 0;
442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting `unicodevar` to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
452
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455{
456 PyUnicodeObject *unicode;
457
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
462 /* Optimization for empty strings */
463 if (size == 0 && unicode_empty != NULL) {
464 Py_INCREF(unicode_empty);
465 return (PyObject *)unicode_empty;
466 }
467
468 /* Single character Unicode objects in the Latin-1 range are
469 shared when using this constructor */
470 if (size == 1 && *u < 256) {
471 unicode = unicode_latin1[*u];
472 if (!unicode) {
473 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 if (!unicode)
475 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000476 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 unicode_latin1[*u] = unicode;
478 }
479 Py_INCREF(unicode);
480 return (PyObject *)unicode;
481 }
482 }
Tim Petersced69f82003-09-16 20:30:58 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 unicode = _PyUnicode_New(size);
485 if (!unicode)
486 return NULL;
487
488 /* Copy the Unicode data into the new object */
489 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491
492 return (PyObject *)unicode;
493}
494
Walter Dörwaldd2034312007-05-18 16:29:38 +0000495PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000496{
497 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000498
499 if (size < 0) {
500 PyErr_SetString(PyExc_SystemError,
501 "Negative size passed to PyUnicode_FromStringAndSize");
502 return NULL;
503 }
504
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000506 some optimizations which share commonly used objects.
507 Also, this means the input must be UTF-8, so fall back to the
508 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000509 if (u != NULL) {
510
511 /* Optimization for empty strings */
512 if (size == 0 && unicode_empty != NULL) {
513 Py_INCREF(unicode_empty);
514 return (PyObject *)unicode_empty;
515 }
516
Martin v. Löwis9c121062007-08-05 20:26:11 +0000517 /* Single characters are shared when using this constructor.
518 Restrict to ASCII, since the input must be UTF-8. */
519 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000520 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521 if (!unicode) {
522 unicode = _PyUnicode_New(1);
523 if (!unicode)
524 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000525 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000526 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 }
528 Py_INCREF(unicode);
529 return (PyObject *)unicode;
530 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531
532 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000533 }
534
Walter Dörwald55507312007-05-18 13:12:10 +0000535 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 if (!unicode)
537 return NULL;
538
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 return (PyObject *)unicode;
540}
541
Walter Dörwaldd2034312007-05-18 16:29:38 +0000542PyObject *PyUnicode_FromString(const char *u)
543{
544 size_t size = strlen(u);
545 if (size > PY_SSIZE_T_MAX) {
546 PyErr_SetString(PyExc_OverflowError, "input too long");
547 return NULL;
548 }
549
550 return PyUnicode_FromStringAndSize(u, size);
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553#ifdef HAVE_WCHAR_H
554
555PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000556 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000557{
558 PyUnicodeObject *unicode;
559
560 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000561 if (size == 0)
562 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
Martin v. Löwis790465f2008-04-05 20:41:37 +0000567 if (size == -1) {
568 size = wcslen(w);
569 }
570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 unicode = _PyUnicode_New(size);
572 if (!unicode)
573 return NULL;
574
575 /* Copy the wchar_t data into the new object */
576#ifdef HAVE_USABLE_WCHAR_T
577 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000578#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579 {
580 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000583 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 *u++ = *w++;
585 }
586#endif
587
588 return (PyObject *)unicode;
589}
590
Walter Dörwald346737f2007-05-31 10:44:43 +0000591static void
592makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
593{
594 *fmt++ = '%';
595 if (width) {
596 if (zeropad)
597 *fmt++ = '0';
598 fmt += sprintf(fmt, "%d", width);
599 }
600 if (precision)
601 fmt += sprintf(fmt, ".%d", precision);
602 if (longflag)
603 *fmt++ = 'l';
604 else if (size_tflag) {
605 char *f = PY_FORMAT_SIZE_T;
606 while (*f)
607 *fmt++ = *f++;
608 }
609 *fmt++ = c;
610 *fmt = '\0';
611}
612
/* Append the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (the terminator is not
   copied).  Uses the variables `copy` and `s` of the expanding scope. */
#define appendstring(string) \
    {copy = (string); while (*copy) *s++ = *copy++;}
614
615PyObject *
616PyUnicode_FromFormatV(const char *format, va_list vargs)
617{
618 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000619 Py_ssize_t callcount = 0;
620 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000621 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000623 int width = 0;
624 int precision = 0;
625 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 const char* f;
627 Py_UNICODE *s;
628 PyObject *string;
629 /* used by sprintf */
630 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000631 /* use abuffer instead of buffer, if we need more space
632 * (which can happen if there's a format specifier with width). */
633 char *abuffer = NULL;
634 char *realbuffer;
635 Py_ssize_t abuffersize = 0;
636 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000637 const char *copy;
638
639#ifdef VA_LIST_IS_ARRAY
640 Py_MEMCPY(count, vargs, sizeof(va_list));
641#else
642#ifdef __va_copy
643 __va_copy(count, vargs);
644#else
645 count = vargs;
646#endif
647#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000648 /* step 1: count the number of %S/%R/%A format specifications
649 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
650 * these objects once during step 3 and put the result in
651 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000652 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000653 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000654 ++callcount;
655 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000656 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000657 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000659 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000660 if (!callresults) {
661 PyErr_NoMemory();
662 return NULL;
663 }
664 callresult = callresults;
665 }
666 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000667 for (f = format; *f; f++) {
668 if (*f == '%') {
669 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000670 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000671 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000672 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000673 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 ;
675
676 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
677 * they don't affect the amount of space we reserve.
678 */
679 if ((*f == 'l' || *f == 'z') &&
680 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000681 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000682
683 switch (*f) {
684 case 'c':
685 (void)va_arg(count, int);
686 /* fall through... */
687 case '%':
688 n++;
689 break;
690 case 'd': case 'u': case 'i': case 'x':
691 (void) va_arg(count, int);
692 /* 20 bytes is enough to hold a 64-bit
693 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 This isn't enough for octal.
695 If a width is specified we need more
696 (which we allocate later). */
697 if (width < 20)
698 width = 20;
699 n += width;
700 if (abuffersize < width)
701 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 break;
703 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000704 {
705 /* UTF-8 */
706 unsigned char*s;
707 s = va_arg(count, unsigned char*);
708 while (*s) {
709 if (*s < 128) {
710 n++; s++;
711 } else if (*s < 0xc0) {
712 /* invalid UTF-8 */
713 n++; s++;
714 } else if (*s < 0xc0) {
715 n++;
716 s++; if(!*s)break;
717 s++;
718 } else if (*s < 0xe0) {
719 n++;
720 s++; if(!*s)break;
721 s++; if(!*s)break;
722 s++;
723 } else {
724 #ifdef Py_UNICODE_WIDE
725 n++;
726 #else
727 n+=2;
728 #endif
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++; if(!*s)break;
732 s++;
733 }
734 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000735 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000736 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000737 case 'U':
738 {
739 PyObject *obj = va_arg(count, PyObject *);
740 assert(obj && PyUnicode_Check(obj));
741 n += PyUnicode_GET_SIZE(obj);
742 break;
743 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000744 case 'V':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 const char *str = va_arg(count, const char *);
748 assert(obj || str);
749 assert(!obj || PyUnicode_Check(obj));
750 if (obj)
751 n += PyUnicode_GET_SIZE(obj);
752 else
753 n += strlen(str);
754 break;
755 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000756 case 'S':
757 {
758 PyObject *obj = va_arg(count, PyObject *);
759 PyObject *str;
760 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000761 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000762 if (!str)
763 goto fail;
764 n += PyUnicode_GET_SIZE(str);
765 /* Remember the str and switch to the next slot */
766 *callresult++ = str;
767 break;
768 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000769 case 'R':
770 {
771 PyObject *obj = va_arg(count, PyObject *);
772 PyObject *repr;
773 assert(obj);
774 repr = PyObject_Repr(obj);
775 if (!repr)
776 goto fail;
777 n += PyUnicode_GET_SIZE(repr);
778 /* Remember the repr and switch to the next slot */
779 *callresult++ = repr;
780 break;
781 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000782 case 'A':
783 {
784 PyObject *obj = va_arg(count, PyObject *);
785 PyObject *ascii;
786 assert(obj);
787 ascii = PyObject_ASCII(obj);
788 if (!ascii)
789 goto fail;
790 n += PyUnicode_GET_SIZE(ascii);
791 /* Remember the repr and switch to the next slot */
792 *callresult++ = ascii;
793 break;
794 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 case 'p':
796 (void) va_arg(count, int);
797 /* maximum 64-bit pointer representation:
798 * 0xffffffffffffffff
799 * so 19 characters is enough.
800 * XXX I count 18 -- what's the extra for?
801 */
802 n += 19;
803 break;
804 default:
805 /* if we stumble upon an unknown
806 formatting code, copy the rest of
807 the format string to the output
808 string. (we cannot just skip the
809 code, since there's no way to know
810 what's in the argument list) */
811 n += strlen(p);
812 goto expand;
813 }
814 } else
815 n++;
816 }
817 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000818 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000819 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 if (!abuffer) {
821 PyErr_NoMemory();
822 goto fail;
823 }
824 realbuffer = abuffer;
825 }
826 else
827 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000828 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000829 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000830 we don't have to resize the string.
831 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000832 string = PyUnicode_FromUnicode(NULL, n);
833 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000834 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000835
836 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838
839 for (f = format; *f; f++) {
840 if (*f == '%') {
841 const char* p = f++;
842 int longflag = 0;
843 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000844 zeropad = (*f == '0');
845 /* parse the width.precision part */
846 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000847 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000848 width = (width*10) + *f++ - '0';
849 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000850 if (*f == '.') {
851 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000852 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000854 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855 /* handle the long flag, but only for %ld and %lu.
856 others can be added when necessary. */
857 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
858 longflag = 1;
859 ++f;
860 }
861 /* handle the size_t flag. */
862 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
863 size_tflag = 1;
864 ++f;
865 }
866
867 switch (*f) {
868 case 'c':
869 *s++ = va_arg(vargs, int);
870 break;
871 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000873 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000876 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000877 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 sprintf(realbuffer, fmt, va_arg(vargs, int));
879 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000880 break;
881 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000882 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000883 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000884 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000885 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000886 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000888 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
889 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000890 break;
891 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000892 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
893 sprintf(realbuffer, fmt, va_arg(vargs, int));
894 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000895 break;
896 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000897 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
898 sprintf(realbuffer, fmt, va_arg(vargs, int));
899 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000900 break;
901 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000902 {
903 /* Parameter must be UTF-8 encoded.
904 In case of encoding errors, use
905 the replacement character. */
906 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000908 u = PyUnicode_DecodeUTF8(p, strlen(p),
909 "replace");
910 if (!u)
911 goto fail;
912 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
913 PyUnicode_GET_SIZE(u));
914 s += PyUnicode_GET_SIZE(u);
915 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000917 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000918 case 'U':
919 {
920 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000921 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
922 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
923 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000924 break;
925 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000926 case 'V':
927 {
928 PyObject *obj = va_arg(vargs, PyObject *);
929 const char *str = va_arg(vargs, const char *);
930 if (obj) {
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 } else {
935 appendstring(str);
936 }
937 break;
938 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000939 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000940 case 'R':
941 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000942 Py_UNICODE *ucopy;
943 Py_ssize_t usize;
944 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000945 /* unused, since we already have the result */
946 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000947 ucopy = PyUnicode_AS_UNICODE(*callresult);
948 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000949 for (upos = 0; upos<usize;)
950 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000951 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000953 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 ++callresult;
955 break;
956 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957 case 'p':
958 sprintf(buffer, "%p", va_arg(vargs, void*));
959 /* %p is ill-defined: ensure leading 0x. */
960 if (buffer[1] == 'X')
961 buffer[1] = 'x';
962 else if (buffer[1] != 'x') {
963 memmove(buffer+2, buffer, strlen(buffer)+1);
964 buffer[0] = '0';
965 buffer[1] = 'x';
966 }
967 appendstring(buffer);
968 break;
969 case '%':
970 *s++ = '%';
971 break;
972 default:
973 appendstring(p);
974 goto end;
975 }
976 } else
977 *s++ = *f;
978 }
979
980 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000981 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000982 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000983 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000984 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000985 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
986 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000987 fail:
988 if (callresults) {
989 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000990 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000991 Py_DECREF(*callresult2);
992 ++callresult2;
993 }
Christian Heimesb186d002008-03-18 15:15:01 +0000994 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000995 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000996 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000997 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000998 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000999}
1000
1001#undef appendstring
1002
1003PyObject *
1004PyUnicode_FromFormat(const char *format, ...)
1005{
1006 PyObject* ret;
1007 va_list vargs;
1008
1009#ifdef HAVE_STDARG_PROTOTYPES
1010 va_start(vargs, format);
1011#else
1012 va_start(vargs);
1013#endif
1014 ret = PyUnicode_FromFormatV(format, vargs);
1015 va_end(vargs);
1016 return ret;
1017}
1018
Martin v. Löwis18e16552006-02-15 17:27:45 +00001019Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1020 wchar_t *w,
1021 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022{
1023 if (unicode == NULL) {
1024 PyErr_BadInternalCall();
1025 return -1;
1026 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001027
1028 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030 size = PyUnicode_GET_SIZE(unicode) + 1;
1031
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032#ifdef HAVE_USABLE_WCHAR_T
1033 memcpy(w, unicode->str, size * sizeof(wchar_t));
1034#else
1035 {
1036 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001037 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001039 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 *w++ = *u++;
1041 }
1042#endif
1043
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001044 if (size > PyUnicode_GET_SIZE(unicode))
1045 return PyUnicode_GET_SIZE(unicode);
1046 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 return size;
1048}
1049
1050#endif
1051
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001052PyObject *PyUnicode_FromOrdinal(int ordinal)
1053{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001054 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001055
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001056 if (ordinal < 0 || ordinal > 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001058 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059 return NULL;
1060 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001061
1062#ifndef Py_UNICODE_WIDE
1063 if (ordinal > 0xffff) {
1064 ordinal -= 0x10000;
1065 s[0] = 0xD800 | (ordinal >> 10);
1066 s[1] = 0xDC00 | (ordinal & 0x3FF);
1067 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 }
1069#endif
1070
Hye-Shik Chang40574832004-04-06 07:24:51 +00001071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001073}
1074
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075PyObject *PyUnicode_FromObject(register PyObject *obj)
1076{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001078 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 if (PyUnicode_CheckExact(obj)) {
1080 Py_INCREF(obj);
1081 return obj;
1082 }
1083 if (PyUnicode_Check(obj)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
1088 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001089 PyErr_Format(PyExc_TypeError,
1090 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001091 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001092 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093}
1094
1095PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1096 const char *encoding,
1097 const char *errors)
1098{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001099 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001100 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001101 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 if (obj == NULL) {
1104 PyErr_BadInternalCall();
1105 return NULL;
1106 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001108 if (PyUnicode_Check(obj)) {
1109 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001110 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001111 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001112 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001113
1114 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001115 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001116 s = PyBytes_AS_STRING(obj);
1117 len = PyBytes_GET_SIZE(obj);
1118 }
1119 else if (PyByteArray_Check(obj)) {
1120 s = PyByteArray_AS_STRING(obj);
1121 len = PyByteArray_GET_SIZE(obj);
1122 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001123 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1124 /* Overwrite the error message with something more useful in
1125 case of a TypeError. */
1126 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001127 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001128 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001129 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001130 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001131 goto onError;
1132 }
Tim Petersced69f82003-09-16 20:30:58 +00001133
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001134 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 if (len == 0) {
1136 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001137 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 }
Tim Petersced69f82003-09-16 20:30:58 +00001139 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001140 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001141
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001142 return v;
1143
1144 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146}
1147
1148PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 const char *encoding,
1151 const char *errors)
1152{
1153 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001154 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001155 char lower[20]; /* Enough for any encoding name we recognize */
1156 char *l;
1157 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001158
1159 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001160 encoding = PyUnicode_GetDefaultEncoding();
1161
1162 /* Convert encoding to lower case and replace '_' with '-' in order to
1163 catch e.g. UTF_8 */
1164 e = encoding;
1165 l = lower;
1166 while (*e && l < &lower[(sizeof lower) - 2]) {
1167 if (ISUPPER(*e)) {
1168 *l++ = TOLOWER(*e++);
1169 }
1170 else if (*e == '_') {
1171 *l++ = '-';
1172 e++;
1173 }
1174 else {
1175 *l++ = *e++;
1176 }
1177 }
1178 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001179
1180 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001181 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001183 else if ((strcmp(lower, "latin-1") == 0) ||
1184 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001185 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001186#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001187 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001188 return PyUnicode_DecodeMBCS(s, size, errors);
1189#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001191 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if (strcmp(lower, "utf-16") == 0)
1193 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1194 else if (strcmp(lower, "utf-32") == 0)
1195 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196
1197 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001198 buffer = NULL;
1199 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1200 goto onError;
1201 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001209 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 onError:
1218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
1234 encoding = PyUnicode_GetDefaultEncoding();
1235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
1242 onError:
1243 return NULL;
1244}
1245
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001246PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1247 const char *encoding,
1248 const char *errors)
1249{
1250 PyObject *v;
1251
1252 if (!PyUnicode_Check(unicode)) {
1253 PyErr_BadArgument();
1254 goto onError;
1255 }
1256
1257 if (encoding == NULL)
1258 encoding = PyUnicode_GetDefaultEncoding();
1259
1260 /* Decode via the codec registry */
1261 v = PyCodec_Decode(unicode, encoding, errors);
1262 if (v == NULL)
1263 goto onError;
1264 if (!PyUnicode_Check(v)) {
1265 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001266 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001267 Py_TYPE(v)->tp_name);
1268 Py_DECREF(v);
1269 goto onError;
1270 }
1271 return v;
1272
1273 onError:
1274 return NULL;
1275}
1276
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 const char *encoding,
1280 const char *errors)
1281{
1282 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 unicode = PyUnicode_FromUnicode(s, size);
1285 if (unicode == NULL)
1286 return NULL;
1287 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1288 Py_DECREF(unicode);
1289 return v;
1290}
1291
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001292PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1293 const char *encoding,
1294 const char *errors)
1295{
1296 PyObject *v;
1297
1298 if (!PyUnicode_Check(unicode)) {
1299 PyErr_BadArgument();
1300 goto onError;
1301 }
1302
1303 if (encoding == NULL)
1304 encoding = PyUnicode_GetDefaultEncoding();
1305
1306 /* Encode via the codec registry */
1307 v = PyCodec_Encode(unicode, encoding, errors);
1308 if (v == NULL)
1309 goto onError;
1310 return v;
1311
1312 onError:
1313 return NULL;
1314}
1315
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1317 const char *encoding,
1318 const char *errors)
1319{
1320 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001321
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 if (!PyUnicode_Check(unicode)) {
1323 PyErr_BadArgument();
1324 goto onError;
1325 }
Fred Drakee4315f52000-05-09 19:53:39 +00001326
Tim Petersced69f82003-09-16 20:30:58 +00001327 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001328 encoding = PyUnicode_GetDefaultEncoding();
1329
1330 /* Shortcuts for common default encodings */
1331 if (errors == NULL) {
1332 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001333 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001334 else if (strcmp(encoding, "latin-1") == 0)
1335 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1337 else if (strcmp(encoding, "mbcs") == 0)
1338 return PyUnicode_AsMBCSString(unicode);
1339#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001340 else if (strcmp(encoding, "ascii") == 0)
1341 return PyUnicode_AsASCIIString(unicode);
1342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343
1344 /* Encode via the codec registry */
1345 v = PyCodec_Encode(unicode, encoding, errors);
1346 if (v == NULL)
1347 goto onError;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001348 if (PyByteArray_Check(v)) {
1349 char msg[100];
1350 PyOS_snprintf(msg, sizeof(msg),
1351 "encoder %s returned buffer instead of bytes",
1352 encoding);
1353 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1354 v = NULL;
1355 goto onError;
1356 }
1357 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1358 }
1359 else if (!PyBytes_Check(v)) {
1360 PyErr_Format(PyExc_TypeError,
1361 "encoder did not return a bytes object (type=%.400s)",
1362 Py_TYPE(v)->tp_name);
1363 v = NULL;
1364 }
1365 return v;
1366
1367 onError:
1368 return NULL;
1369}
1370
1371PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1372 const char *encoding,
1373 const char *errors)
1374{
1375 PyObject *v;
1376
1377 if (!PyUnicode_Check(unicode)) {
1378 PyErr_BadArgument();
1379 goto onError;
1380 }
1381
1382 if (encoding == NULL)
1383 encoding = PyUnicode_GetDefaultEncoding();
1384
1385 /* Encode via the codec registry */
1386 v = PyCodec_Encode(unicode, encoding, errors);
1387 if (v == NULL)
1388 goto onError;
1389 if (!PyUnicode_Check(v)) {
1390 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001391 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001392 Py_TYPE(v)->tp_name);
1393 Py_DECREF(v);
1394 goto onError;
1395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 onError:
1399 return NULL;
1400}
1401
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001402PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1403 const char *errors)
1404{
1405 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001406 if (v)
1407 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001408 if (errors != NULL)
1409 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001410 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001411 PyUnicode_GET_SIZE(unicode),
1412 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001413 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001414 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001415 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001416 return v;
1417}
1418
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001419PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001420PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001421 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001422 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1423}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001424
Christian Heimes5894ba72007-11-04 11:43:14 +00001425PyObject*
1426PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1427{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001428 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1429 can be undefined. If it is case, decode using UTF-8. The following assumes
1430 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1431 bootstrapping process where the codecs aren't ready yet.
1432 */
1433 if (Py_FileSystemDefaultEncoding) {
1434#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001435 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001436 return PyUnicode_DecodeMBCS(s, size, "replace");
1437 }
1438#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001439 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001440 return PyUnicode_DecodeUTF8(s, size, "replace");
1441 }
1442#endif
1443 return PyUnicode_Decode(s, size,
1444 Py_FileSystemDefaultEncoding,
1445 "replace");
1446 }
1447 else {
1448 return PyUnicode_DecodeUTF8(s, size, "replace");
1449 }
1450}
1451
Martin v. Löwis5b222132007-06-10 09:51:05 +00001452char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001453PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001454{
Christian Heimesf3863112007-11-22 07:46:41 +00001455 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001456 if (!PyUnicode_Check(unicode)) {
1457 PyErr_BadArgument();
1458 return NULL;
1459 }
Christian Heimesf3863112007-11-22 07:46:41 +00001460 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1461 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001462 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001463 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001464 *psize = PyBytes_GET_SIZE(bytes);
1465 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001466}
1467
1468char*
1469PyUnicode_AsString(PyObject *unicode)
1470{
1471 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001472}
1473
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1475{
1476 if (!PyUnicode_Check(unicode)) {
1477 PyErr_BadArgument();
1478 goto onError;
1479 }
1480 return PyUnicode_AS_UNICODE(unicode);
1481
1482 onError:
1483 return NULL;
1484}
1485
Martin v. Löwis18e16552006-02-15 17:27:45 +00001486Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487{
1488 if (!PyUnicode_Check(unicode)) {
1489 PyErr_BadArgument();
1490 goto onError;
1491 }
1492 return PyUnicode_GET_SIZE(unicode);
1493
1494 onError:
1495 return -1;
1496}
1497
Thomas Wouters78890102000-07-22 19:25:51 +00001498const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001499{
1500 return unicode_default_encoding;
1501}
1502
1503int PyUnicode_SetDefaultEncoding(const char *encoding)
1504{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001505 if (strcmp(encoding, unicode_default_encoding) != 0) {
1506 PyErr_Format(PyExc_ValueError,
1507 "Can only set default encoding to %s",
1508 unicode_default_encoding);
1509 return -1;
1510 }
Fred Drakee4315f52000-05-09 19:53:39 +00001511 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001512}
1513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001514/* error handling callback helper:
1515 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001516 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 and adjust various state variables.
1518 return 0 on success, -1 on error
1519*/
1520
1521static
1522int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1523 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001524 const char **input, const char **inend, Py_ssize_t *startinpos,
1525 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001528 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529
1530 PyObject *restuple = NULL;
1531 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001532 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001533 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001534 Py_ssize_t requiredsize;
1535 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001537 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001538 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 int res = -1;
1540
1541 if (*errorHandler == NULL) {
1542 *errorHandler = PyCodec_LookupError(errors);
1543 if (*errorHandler == NULL)
1544 goto onError;
1545 }
1546
1547 if (*exceptionObject == NULL) {
1548 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001549 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 if (*exceptionObject == NULL)
1551 goto onError;
1552 }
1553 else {
1554 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1555 goto onError;
1556 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1557 goto onError;
1558 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1559 goto onError;
1560 }
1561
1562 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1563 if (restuple == NULL)
1564 goto onError;
1565 if (!PyTuple_Check(restuple)) {
1566 PyErr_Format(PyExc_TypeError, &argparse[4]);
1567 goto onError;
1568 }
1569 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1570 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001571
1572 /* Copy back the bytes variables, which might have been modified by the
1573 callback */
1574 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1575 if (!inputobj)
1576 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001577 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001578 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1579 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001580 *input = PyBytes_AS_STRING(inputobj);
1581 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001582 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001583 /* we can DECREF safely, as the exception has another reference,
1584 so the object won't go away. */
1585 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001587 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001588 newpos = insize+newpos;
1589 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001590 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001591 goto onError;
1592 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593
1594 /* need more space? (at least enough for what we
1595 have+the replacement+the rest of the string (starting
1596 at the new input position), so we won't have to check space
1597 when there are no errors in the rest of the string) */
1598 repptr = PyUnicode_AS_UNICODE(repunicode);
1599 repsize = PyUnicode_GET_SIZE(repunicode);
1600 requiredsize = *outpos + repsize + insize-newpos;
1601 if (requiredsize > outsize) {
1602 if (requiredsize<2*outsize)
1603 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001604 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 goto onError;
1606 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1607 }
1608 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001609 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 Py_UNICODE_COPY(*outptr, repptr, repsize);
1611 *outptr += repsize;
1612 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 /* we made it! */
1615 res = 0;
1616
1617 onError:
1618 Py_XDECREF(restuple);
1619 return res;
1620}
1621
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622/* --- UTF-7 Codec -------------------------------------------------------- */
1623
1624/* see RFC2152 for details */
1625
static
char utf7_special[128] = {
    /* Per-character class used by the UTF-7 codec to decide whether a
       7-bit character is special, i.e. cannot be directly encoded:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional) */

    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1644
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when character c must not be written
   directly.  Characters outside 1..127 and class-1 characters are always
   special; whitespace (class 2) and Set O (class 3) are special only when
   the corresponding flag is non-zero. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is a base64 digit. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in accumulator ch,
   write the topmost 6 pending bits to *out as a base64 digit and decrease
   bits by 6.  Wrapped in do/while(0) so it behaves as a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   accumulator ch, extract the topmost 16 pending bits as one Py_UNICODE and
   append it to *out.  Relies on the enclosing function providing an `errmsg`
   variable and a `utf7Error` label.  Wrapped in do/while(0) so it behaves as
   a single statement. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate */ \
                /* so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent */ \
                /* it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001688PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690 const char *errors)
1691{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001692 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1693}
1694
/* Decode the `size` bytes at `s` as UTF-7 into a new unicode object.

   errors   - name of the error handler, passed on to
              unicode_decode_call_errorhandler() for every decoding error.
   consumed - if NULL, input that ends inside a shift sequence is reported
              as an "unterminated shift sequence" error.  If non-NULL, that
              case is not an error; *consumed receives startinpos (recorded
              when the opening '+' was seen) if the input ended inside a
              shift sequence, and the number of bytes processed otherwise.

   Returns the new object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of input; may be replaced by the error helper */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* end of input */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* next free slot in the output buffer */
    const char *errmsg = "";        /* also assigned by the DECODE macro */
    int inShift = 0;                /* non-zero while inside a '+...' base64 run */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bit accumulator for base64 data */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* One output slot per input byte; the error helper grows the buffer
       itself when a replacement needs more room. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below asked to resume before the end of the input. */
        restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself produces no output.  If
                       another '-' follows, a literal '-' is emitted and
                       shift mode is re-entered so that the following '-'
                       is consumed by this same branch on the next pass. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    /* The terminating character is ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: shift its 6 bits into the accumulator and
                   emit any complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence started for error reporting and
               for the stateful *consumed result. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* The helper may replace starts/e, moves s to the resume position
           and may reallocate the output (updating unicode and p). */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence: an error unless the caller is
       decoding statefully (consumed != NULL). */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have asked to resume before the end of input. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1840
1841
1842PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001843 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 int encodeSetO,
1845 int encodeWhiteSpace,
1846 const char *errors)
1847{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001848 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001850 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001852 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 unsigned int bitsleft = 0;
1854 unsigned long charsleft = 0;
1855 char * out;
1856 char * start;
1857
1858 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001859 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860
Christian Heimes9c4756e2008-05-26 13:22:05 +00001861 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001862 if (v == NULL)
1863 return NULL;
1864
Christian Heimes9c4756e2008-05-26 13:22:05 +00001865 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001866 for (;i < size; ++i) {
1867 Py_UNICODE ch = s[i];
1868
1869 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001870 if (ch == '+') {
1871 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 *out++ = '-';
1873 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1874 charsleft = ch;
1875 bitsleft = 16;
1876 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001877 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001879 } else {
1880 *out++ = (char) ch;
1881 }
1882 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1884 *out++ = B64(charsleft << (6-bitsleft));
1885 charsleft = 0;
1886 bitsleft = 0;
1887 /* Characters not in the BASE64 set implicitly unshift the sequence
1888 so no '-' is required, except if the character is itself a '-' */
1889 if (B64CHAR(ch) || ch == '-') {
1890 *out++ = '-';
1891 }
1892 inShift = 0;
1893 *out++ = (char) ch;
1894 } else {
1895 bitsleft += 16;
1896 charsleft = (charsleft << 16) | ch;
1897 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1898
1899 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001900 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001901 or '-' then the shift sequence will be terminated implicitly and we
1902 don't have to insert a '-'. */
1903
1904 if (bitsleft == 0) {
1905 if (i + 1 < size) {
1906 Py_UNICODE ch2 = s[i+1];
1907
1908 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001909
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001910 } else if (B64CHAR(ch2) || ch2 == '-') {
1911 *out++ = '-';
1912 inShift = 0;
1913 } else {
1914 inShift = 0;
1915 }
1916
1917 }
1918 else {
1919 *out++ = '-';
1920 inShift = 0;
1921 }
1922 }
Tim Petersced69f82003-09-16 20:30:58 +00001923 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001924 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001925 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 if (bitsleft) {
1927 *out++= B64(charsleft << (6-bitsleft) );
1928 *out++ = '-';
1929 }
1930
Christian Heimes72b710a2008-05-26 13:28:38 +00001931 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001932 Py_DECREF(v);
1933 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934}
1935
1936#undef SPECIAL
1937#undef B64
1938#undef B64CHAR
1939#undef UB64
1940#undef ENCODE
1941#undef DECODE
1942
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943/* --- UTF-8 Codec -------------------------------------------------------- */
1944
static
char utf8_code_length[256] = {
    /* Sequence length implied by each possible UTF-8 lead byte; zero
       marks a byte that is not a legal prefix.  See RFC 2279 for
       details. */

    /* 0x00 - 0x7f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xbf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xc0 - 0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xe0 - 0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xf0 - 0xff */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1966
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 const char *errors)
1970{
Walter Dörwald69652032004-09-07 20:24:22 +00001971 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1972}
1973
1974PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001975 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001976 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001977 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001981 Py_ssize_t startinpos;
1982 Py_ssize_t endinpos;
1983 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 const char *e;
1985 PyUnicodeObject *unicode;
1986 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001987 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001988 PyObject *errorHandler = NULL;
1989 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990
1991 /* Note: size will always be longer than the resulting Unicode
1992 character count */
1993 unicode = _PyUnicode_New(size);
1994 if (!unicode)
1995 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001996 if (size == 0) {
1997 if (consumed)
1998 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 /* Unpack UTF-8 encoded data */
2003 p = unicode->str;
2004 e = s + size;
2005
2006 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002007 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008
2009 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002010 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 s++;
2012 continue;
2013 }
2014
2015 n = utf8_code_length[ch];
2016
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002017 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002018 if (consumed)
2019 break;
2020 else {
2021 errmsg = "unexpected end of data";
2022 startinpos = s-starts;
2023 endinpos = size;
2024 goto utf8Error;
2025 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
2028 switch (n) {
2029
2030 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002031 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 startinpos = s-starts;
2033 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002034 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
2036 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002037 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 startinpos = s-starts;
2039 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002040 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002043 if ((s[1] & 0xc0) != 0x80) {
2044 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 startinpos = s-starts;
2046 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002047 goto utf8Error;
2048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002050 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 startinpos = s-starts;
2052 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002053 errmsg = "illegal encoding";
2054 goto utf8Error;
2055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002057 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 break;
2059
2060 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002061 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002062 (s[2] & 0xc0) != 0x80) {
2063 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 startinpos = s-starts;
2065 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002066 goto utf8Error;
2067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002069 if (ch < 0x0800) {
2070 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002071 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002072
2073 XXX For wide builds (UCS-4) we should probably try
2074 to recombine the surrogates into a single code
2075 unit.
2076 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002077 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 startinpos = s-starts;
2079 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002080 goto utf8Error;
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002083 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002084 break;
2085
2086 case 4:
2087 if ((s[1] & 0xc0) != 0x80 ||
2088 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002089 (s[3] & 0xc0) != 0x80) {
2090 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 startinpos = s-starts;
2092 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002093 goto utf8Error;
2094 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002095 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2096 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2097 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002098 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002099 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002101 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002103 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002104 startinpos = s-starts;
2105 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002106 goto utf8Error;
2107 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002108#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002109 *p++ = (Py_UNICODE)ch;
2110#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002111 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002112
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 /* translate from 10000..10FFFF to 0..FFFF */
2114 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002115
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 /* high surrogate = top 10 bits added to D800 */
2117 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002118
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002119 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002120 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002121#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 break;
2123
2124 default:
2125 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002126 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 startinpos = s-starts;
2128 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002129 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 }
2131 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002132 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002133
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 outpos = p-PyUnicode_AS_UNICODE(unicode);
2136 if (unicode_decode_call_errorhandler(
2137 errors, &errorHandler,
2138 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002139 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002140 (PyObject **)&unicode, &outpos, &p))
2141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 }
Walter Dörwald69652032004-09-07 20:24:22 +00002143 if (consumed)
2144 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
2146 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002147 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 goto onError;
2149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 Py_XDECREF(errorHandler);
2151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 return (PyObject *)unicode;
2153
2154onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002155 Py_XDECREF(errorHandler);
2156 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 Py_DECREF(unicode);
2158 return NULL;
2159}
2160
Tim Peters602f7402002-04-27 18:03:26 +00002161/* Allocation strategy: if the string is short, convert into a stack buffer
2162 and allocate exactly as much space needed at the end. Else allocate the
2163 maximum possible needed (4 result bytes per Unicode character), and return
2164 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002165*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002166PyObject *
2167PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002168 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002169 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Tim Peters602f7402002-04-27 18:03:26 +00002171#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002172
Guido van Rossum98297ee2007-11-06 21:34:58 +00002173 Py_ssize_t i; /* index into s of next input byte */
2174 PyObject *result; /* result string object */
2175 char *p; /* next free byte in output buffer */
2176 Py_ssize_t nallocated; /* number of result bytes allocated */
2177 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002178 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002179
Tim Peters602f7402002-04-27 18:03:26 +00002180 assert(s != NULL);
2181 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182
Tim Peters602f7402002-04-27 18:03:26 +00002183 if (size <= MAX_SHORT_UNICHARS) {
2184 /* Write into the stack buffer; nallocated can't overflow.
2185 * At the end, we'll allocate exactly as much heap space as it
2186 * turns out we need.
2187 */
2188 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002189 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002190 p = stackbuf;
2191 }
2192 else {
2193 /* Overallocate on the heap, and give the excess back at the end. */
2194 nallocated = size * 4;
2195 if (nallocated / 4 != size) /* overflow! */
2196 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002197 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002198 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002199 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002200 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002201 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002202
Tim Peters602f7402002-04-27 18:03:26 +00002203 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002204 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002205
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002206 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002207 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002209
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002211 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002212 *p++ = (char)(0xc0 | (ch >> 6));
2213 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002215 else {
Tim Peters602f7402002-04-27 18:03:26 +00002216 /* Encode UCS2 Unicode ordinals */
2217 if (ch < 0x10000) {
2218 /* Special case: check for high surrogate */
2219 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2220 Py_UCS4 ch2 = s[i];
2221 /* Check for low surrogate and combine the two to
2222 form a UCS4 value */
2223 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002224 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002225 i++;
2226 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002227 }
Tim Peters602f7402002-04-27 18:03:26 +00002228 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002229 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002230 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002231 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2232 *p++ = (char)(0x80 | (ch & 0x3f));
2233 continue;
2234 }
2235encodeUCS4:
2236 /* Encode UCS4 Unicode ordinals */
2237 *p++ = (char)(0xf0 | (ch >> 18));
2238 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2239 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2240 *p++ = (char)(0x80 | (ch & 0x3f));
2241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002243
Guido van Rossum98297ee2007-11-06 21:34:58 +00002244 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002245 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002246 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002247 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002248 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002249 }
2250 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002251 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002252 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002253 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002254 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002255 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002256 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002257
Tim Peters602f7402002-04-27 18:03:26 +00002258#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259}
2260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2262{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 if (!PyUnicode_Check(unicode)) {
2264 PyErr_BadArgument();
2265 return NULL;
2266 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002267 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2268 PyUnicode_GET_SIZE(unicode),
2269 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270}
2271
Walter Dörwald41980ca2007-08-16 21:55:45 +00002272/* --- UTF-32 Codec ------------------------------------------------------- */
2273
2274PyObject *
2275PyUnicode_DecodeUTF32(const char *s,
2276 Py_ssize_t size,
2277 const char *errors,
2278 int *byteorder)
2279{
2280 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2281}
2282
2283PyObject *
2284PyUnicode_DecodeUTF32Stateful(const char *s,
2285 Py_ssize_t size,
2286 const char *errors,
2287 int *byteorder,
2288 Py_ssize_t *consumed)
2289{
2290 const char *starts = s;
2291 Py_ssize_t startinpos;
2292 Py_ssize_t endinpos;
2293 Py_ssize_t outpos;
2294 PyUnicodeObject *unicode;
2295 Py_UNICODE *p;
2296#ifndef Py_UNICODE_WIDE
2297 int i, pairs;
2298#else
2299 const int pairs = 0;
2300#endif
2301 const unsigned char *q, *e;
2302 int bo = 0; /* assume native ordering by default */
2303 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002304 /* Offsets from q for retrieving bytes in the right order. */
2305#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2306 int iorder[] = {0, 1, 2, 3};
2307#else
2308 int iorder[] = {3, 2, 1, 0};
2309#endif
2310 PyObject *errorHandler = NULL;
2311 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002312 /* On narrow builds we split characters outside the BMP into two
2313 codepoints => count how much extra space we need. */
2314#ifndef Py_UNICODE_WIDE
2315 for (i = pairs = 0; i < size/4; i++)
2316 if (((Py_UCS4 *)s)[i] >= 0x10000)
2317 pairs++;
2318#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002319
2320 /* This might be one to much, because of a BOM */
2321 unicode = _PyUnicode_New((size+3)/4+pairs);
2322 if (!unicode)
2323 return NULL;
2324 if (size == 0)
2325 return (PyObject *)unicode;
2326
2327 /* Unpack UTF-32 encoded data */
2328 p = unicode->str;
2329 q = (unsigned char *)s;
2330 e = q + size;
2331
2332 if (byteorder)
2333 bo = *byteorder;
2334
2335 /* Check for BOM marks (U+FEFF) in the input and adjust current
2336 byte order setting accordingly. In native mode, the leading BOM
2337 mark is skipped, in all other modes, it is copied to the output
2338 stream as-is (giving a ZWNBSP character). */
2339 if (bo == 0) {
2340 if (size >= 4) {
2341 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2342 (q[iorder[1]] << 8) | q[iorder[0]];
2343#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2344 if (bom == 0x0000FEFF) {
2345 q += 4;
2346 bo = -1;
2347 }
2348 else if (bom == 0xFFFE0000) {
2349 q += 4;
2350 bo = 1;
2351 }
2352#else
2353 if (bom == 0x0000FEFF) {
2354 q += 4;
2355 bo = 1;
2356 }
2357 else if (bom == 0xFFFE0000) {
2358 q += 4;
2359 bo = -1;
2360 }
2361#endif
2362 }
2363 }
2364
2365 if (bo == -1) {
2366 /* force LE */
2367 iorder[0] = 0;
2368 iorder[1] = 1;
2369 iorder[2] = 2;
2370 iorder[3] = 3;
2371 }
2372 else if (bo == 1) {
2373 /* force BE */
2374 iorder[0] = 3;
2375 iorder[1] = 2;
2376 iorder[2] = 1;
2377 iorder[3] = 0;
2378 }
2379
2380 while (q < e) {
2381 Py_UCS4 ch;
2382 /* remaining bytes at the end? (size should be divisible by 4) */
2383 if (e-q<4) {
2384 if (consumed)
2385 break;
2386 errmsg = "truncated data";
2387 startinpos = ((const char *)q)-starts;
2388 endinpos = ((const char *)e)-starts;
2389 goto utf32Error;
2390 /* The remaining input chars are ignored if the callback
2391 chooses to skip the input */
2392 }
2393 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2394 (q[iorder[1]] << 8) | q[iorder[0]];
2395
2396 if (ch >= 0x110000)
2397 {
2398 errmsg = "codepoint not in range(0x110000)";
2399 startinpos = ((const char *)q)-starts;
2400 endinpos = startinpos+4;
2401 goto utf32Error;
2402 }
2403#ifndef Py_UNICODE_WIDE
2404 if (ch >= 0x10000)
2405 {
2406 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2407 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2408 }
2409 else
2410#endif
2411 *p++ = ch;
2412 q += 4;
2413 continue;
2414 utf32Error:
2415 outpos = p-PyUnicode_AS_UNICODE(unicode);
2416 if (unicode_decode_call_errorhandler(
2417 errors, &errorHandler,
2418 "utf32", errmsg,
2419 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2420 (PyObject **)&unicode, &outpos, &p))
2421 goto onError;
2422 }
2423
2424 if (byteorder)
2425 *byteorder = bo;
2426
2427 if (consumed)
2428 *consumed = (const char *)q-starts;
2429
2430 /* Adjust length */
2431 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2432 goto onError;
2433
2434 Py_XDECREF(errorHandler);
2435 Py_XDECREF(exc);
2436 return (PyObject *)unicode;
2437
2438onError:
2439 Py_DECREF(unicode);
2440 Py_XDECREF(errorHandler);
2441 Py_XDECREF(exc);
2442 return NULL;
2443}
2444
2445PyObject *
2446PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2447 Py_ssize_t size,
2448 const char *errors,
2449 int byteorder)
2450{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002451 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002452 unsigned char *p;
2453#ifndef Py_UNICODE_WIDE
2454 int i, pairs;
2455#else
2456 const int pairs = 0;
2457#endif
2458 /* Offsets from p for storing byte pairs in the right order. */
2459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2460 int iorder[] = {0, 1, 2, 3};
2461#else
2462 int iorder[] = {3, 2, 1, 0};
2463#endif
2464
2465#define STORECHAR(CH) \
2466 do { \
2467 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2468 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2469 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2470 p[iorder[0]] = (CH) & 0xff; \
2471 p += 4; \
2472 } while(0)
2473
2474 /* In narrow builds we can output surrogate pairs as one codepoint,
2475 so we need less space. */
2476#ifndef Py_UNICODE_WIDE
2477 for (i = pairs = 0; i < size-1; i++)
2478 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2479 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2480 pairs++;
2481#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002482 v = PyByteArray_FromStringAndSize(NULL,
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483 4 * (size - pairs + (byteorder == 0)));
2484 if (v == NULL)
2485 return NULL;
2486
Christian Heimes9c4756e2008-05-26 13:22:05 +00002487 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002488 if (byteorder == 0)
2489 STORECHAR(0xFEFF);
2490 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002491 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002492
2493 if (byteorder == -1) {
2494 /* force LE */
2495 iorder[0] = 0;
2496 iorder[1] = 1;
2497 iorder[2] = 2;
2498 iorder[3] = 3;
2499 }
2500 else if (byteorder == 1) {
2501 /* force BE */
2502 iorder[0] = 3;
2503 iorder[1] = 2;
2504 iorder[2] = 1;
2505 iorder[3] = 0;
2506 }
2507
2508 while (size-- > 0) {
2509 Py_UCS4 ch = *s++;
2510#ifndef Py_UNICODE_WIDE
2511 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2512 Py_UCS4 ch2 = *s;
2513 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2514 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2515 s++;
2516 size--;
2517 }
2518 }
2519#endif
2520 STORECHAR(ch);
2521 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002522
2523 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002524 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002525 Py_DECREF(v);
2526 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002527#undef STORECHAR
2528}
2529
2530PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2531{
2532 if (!PyUnicode_Check(unicode)) {
2533 PyErr_BadArgument();
2534 return NULL;
2535 }
2536 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2537 PyUnicode_GET_SIZE(unicode),
2538 NULL,
2539 0);
2540}
2541
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542/* --- UTF-16 Codec ------------------------------------------------------- */
2543
Tim Peters772747b2001-08-09 22:21:55 +00002544PyObject *
2545PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002546 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002547 const char *errors,
2548 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549{
Walter Dörwald69652032004-09-07 20:24:22 +00002550 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2551}
2552
2553PyObject *
2554PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002555 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002556 const char *errors,
2557 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002558 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002561 Py_ssize_t startinpos;
2562 Py_ssize_t endinpos;
2563 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 PyUnicodeObject *unicode;
2565 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002566 const unsigned char *q, *e;
2567 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002568 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002569 /* Offsets from q for retrieving byte pairs in the right order. */
2570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2571 int ihi = 1, ilo = 0;
2572#else
2573 int ihi = 0, ilo = 1;
2574#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002575 PyObject *errorHandler = NULL;
2576 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577
2578 /* Note: size will always be longer than the resulting Unicode
2579 character count */
2580 unicode = _PyUnicode_New(size);
2581 if (!unicode)
2582 return NULL;
2583 if (size == 0)
2584 return (PyObject *)unicode;
2585
2586 /* Unpack UTF-16 encoded data */
2587 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002588 q = (unsigned char *)s;
2589 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590
2591 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002592 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002594 /* Check for BOM marks (U+FEFF) in the input and adjust current
2595 byte order setting accordingly. In native mode, the leading BOM
2596 mark is skipped, in all other modes, it is copied to the output
2597 stream as-is (giving a ZWNBSP character). */
2598 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002599 if (size >= 2) {
2600 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002601#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002602 if (bom == 0xFEFF) {
2603 q += 2;
2604 bo = -1;
2605 }
2606 else if (bom == 0xFFFE) {
2607 q += 2;
2608 bo = 1;
2609 }
Tim Petersced69f82003-09-16 20:30:58 +00002610#else
Walter Dörwald69652032004-09-07 20:24:22 +00002611 if (bom == 0xFEFF) {
2612 q += 2;
2613 bo = 1;
2614 }
2615 else if (bom == 0xFFFE) {
2616 q += 2;
2617 bo = -1;
2618 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002619#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002620 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622
Tim Peters772747b2001-08-09 22:21:55 +00002623 if (bo == -1) {
2624 /* force LE */
2625 ihi = 1;
2626 ilo = 0;
2627 }
2628 else if (bo == 1) {
2629 /* force BE */
2630 ihi = 0;
2631 ilo = 1;
2632 }
2633
2634 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002636 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002638 if (consumed)
2639 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 errmsg = "truncated data";
2641 startinpos = ((const char *)q)-starts;
2642 endinpos = ((const char *)e)-starts;
2643 goto utf16Error;
2644 /* The remaining input chars are ignored if the callback
2645 chooses to skip the input */
2646 }
2647 ch = (q[ihi] << 8) | q[ilo];
2648
Tim Peters772747b2001-08-09 22:21:55 +00002649 q += 2;
2650
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 if (ch < 0xD800 || ch > 0xDFFF) {
2652 *p++ = ch;
2653 continue;
2654 }
2655
2656 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if (q >= e) {
2658 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 startinpos = (((const char *)q)-2)-starts;
2660 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002661 goto utf16Error;
2662 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002663 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002664 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2665 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002666 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002667#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002668 *p++ = ch;
2669 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002670#else
2671 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002673 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002674 }
2675 else {
2676 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 startinpos = (((const char *)q)-4)-starts;
2678 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002679 goto utf16Error;
2680 }
2681
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002683 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002684 startinpos = (((const char *)q)-2)-starts;
2685 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002686 /* Fall through to report the error */
2687
2688 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 outpos = p-PyUnicode_AS_UNICODE(unicode);
2690 if (unicode_decode_call_errorhandler(
2691 errors, &errorHandler,
2692 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002693 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 }
2697
2698 if (byteorder)
2699 *byteorder = bo;
2700
Walter Dörwald69652032004-09-07 20:24:22 +00002701 if (consumed)
2702 *consumed = (const char *)q-starts;
2703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002705 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 goto onError;
2707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 Py_XDECREF(errorHandler);
2709 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 return (PyObject *)unicode;
2711
2712onError:
2713 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 Py_XDECREF(errorHandler);
2715 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 return NULL;
2717}
2718
Tim Peters772747b2001-08-09 22:21:55 +00002719PyObject *
2720PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002721 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002722 const char *errors,
2723 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002725 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002726 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002727#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002728 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002729#else
2730 const int pairs = 0;
2731#endif
Tim Peters772747b2001-08-09 22:21:55 +00002732 /* Offsets from p for storing byte pairs in the right order. */
2733#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2734 int ihi = 1, ilo = 0;
2735#else
2736 int ihi = 0, ilo = 1;
2737#endif
2738
2739#define STORECHAR(CH) \
2740 do { \
2741 p[ihi] = ((CH) >> 8) & 0xff; \
2742 p[ilo] = (CH) & 0xff; \
2743 p += 2; \
2744 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002747 for (i = pairs = 0; i < size; i++)
2748 if (s[i] >= 0x10000)
2749 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002750#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002751 v = PyByteArray_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002752 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 if (v == NULL)
2754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755
Christian Heimes9c4756e2008-05-26 13:22:05 +00002756 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002758 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002759 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002760 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002761
2762 if (byteorder == -1) {
2763 /* force LE */
2764 ihi = 1;
2765 ilo = 0;
2766 }
2767 else if (byteorder == 1) {
2768 /* force BE */
2769 ihi = 0;
2770 ilo = 1;
2771 }
2772
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002773 while (size-- > 0) {
2774 Py_UNICODE ch = *s++;
2775 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002776#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002777 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002778 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2779 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002781#endif
Tim Peters772747b2001-08-09 22:21:55 +00002782 STORECHAR(ch);
2783 if (ch2)
2784 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002785 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002786
2787 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002788 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002789 Py_DECREF(v);
2790 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002791#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792}
2793
2794PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2795{
2796 if (!PyUnicode_Check(unicode)) {
2797 PyErr_BadArgument();
2798 return NULL;
2799 }
2800 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2801 PyUnicode_GET_SIZE(unicode),
2802 NULL,
2803 0);
2804}
2805
2806/* --- Unicode Escape Codec ----------------------------------------------- */
2807
Fredrik Lundh06d12682001-01-24 07:59:11 +00002808static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002809
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002811 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 const char *errors)
2813{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002815 Py_ssize_t startinpos;
2816 Py_ssize_t endinpos;
2817 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 char* message;
2823 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 PyObject *errorHandler = NULL;
2825 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 /* Escaped strings will always be longer than the resulting
2828 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 length after conversion to the true value.
2830 (but if the error callback returns a long replacement string
2831 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 v = _PyUnicode_New(size);
2833 if (v == NULL)
2834 goto onError;
2835 if (size == 0)
2836 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002840
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 while (s < end) {
2842 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002843 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845
2846 /* Non-escape characters are interpreted as Unicode ordinals */
2847 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002848 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 continue;
2850 }
2851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 /* \ - Escapes */
2854 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002855 c = *s++;
2856 if (s > end)
2857 c = '\0'; /* Invalid after \ */
2858 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859
2860 /* \x escapes */
2861 case '\n': break;
2862 case '\\': *p++ = '\\'; break;
2863 case '\'': *p++ = '\''; break;
2864 case '\"': *p++ = '\"'; break;
2865 case 'b': *p++ = '\b'; break;
2866 case 'f': *p++ = '\014'; break; /* FF */
2867 case 't': *p++ = '\t'; break;
2868 case 'n': *p++ = '\n'; break;
2869 case 'r': *p++ = '\r'; break;
2870 case 'v': *p++ = '\013'; break; /* VT */
2871 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2872
2873 /* \OOO (octal) escapes */
2874 case '0': case '1': case '2': case '3':
2875 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002876 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002877 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002878 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002879 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002880 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002882 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 break;
2884
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 /* hex escapes */
2886 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 digits = 2;
2889 message = "truncated \\xXX escape";
2890 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 digits = 4;
2895 message = "truncated \\uXXXX escape";
2896 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897
Fredrik Lundhccc74732001-02-18 22:13:49 +00002898 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002899 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002900 digits = 8;
2901 message = "truncated \\UXXXXXXXX escape";
2902 hexescape:
2903 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 outpos = p-PyUnicode_AS_UNICODE(v);
2905 if (s+digits>end) {
2906 endinpos = size;
2907 if (unicode_decode_call_errorhandler(
2908 errors, &errorHandler,
2909 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002910 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 (PyObject **)&v, &outpos, &p))
2912 goto onError;
2913 goto nextByte;
2914 }
2915 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002917 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 endinpos = (s+i+1)-starts;
2919 if (unicode_decode_call_errorhandler(
2920 errors, &errorHandler,
2921 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002922 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002924 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002926 }
2927 chr = (chr<<4) & ~0xF;
2928 if (c >= '0' && c <= '9')
2929 chr += c - '0';
2930 else if (c >= 'a' && c <= 'f')
2931 chr += 10 + c - 'a';
2932 else
2933 chr += 10 + c - 'A';
2934 }
2935 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002936 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 /* _decoding_error will have already written into the
2938 target buffer. */
2939 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002941 /* when we get here, chr is a 32-bit unicode character */
2942 if (chr <= 0xffff)
2943 /* UCS-2 character */
2944 *p++ = (Py_UNICODE) chr;
2945 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002946 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002947 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002948#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002949 *p++ = chr;
2950#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002951 chr -= 0x10000L;
2952 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002953 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002954#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002955 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 endinpos = s-starts;
2957 outpos = p-PyUnicode_AS_UNICODE(v);
2958 if (unicode_decode_call_errorhandler(
2959 errors, &errorHandler,
2960 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002961 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002963 goto onError;
2964 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002965 break;
2966
2967 /* \N{name} */
2968 case 'N':
2969 message = "malformed \\N character escape";
2970 if (ucnhash_CAPI == NULL) {
2971 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002972 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002973 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002974 if (m == NULL)
2975 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002976 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002977 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002978 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002979 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002980 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002981 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002982 if (ucnhash_CAPI == NULL)
2983 goto ucnhashError;
2984 }
2985 if (*s == '{') {
2986 const char *start = s+1;
2987 /* look for the closing brace */
2988 while (*s != '}' && s < end)
2989 s++;
2990 if (s > start && s < end && *s == '}') {
2991 /* found a name. look it up in the unicode database */
2992 message = "unknown Unicode character name";
2993 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002994 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002995 goto store;
2996 }
2997 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 endinpos = s-starts;
2999 outpos = p-PyUnicode_AS_UNICODE(v);
3000 if (unicode_decode_call_errorhandler(
3001 errors, &errorHandler,
3002 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003003 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003005 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003006 break;
3007
3008 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003009 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003010 message = "\\ at end of string";
3011 s--;
3012 endinpos = s-starts;
3013 outpos = p-PyUnicode_AS_UNICODE(v);
3014 if (unicode_decode_call_errorhandler(
3015 errors, &errorHandler,
3016 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003017 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003019 goto onError;
3020 }
3021 else {
3022 *p++ = '\\';
3023 *p++ = (unsigned char)s[-1];
3024 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003025 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 nextByte:
3028 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003030 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003032 Py_XDECREF(errorHandler);
3033 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003035
Fredrik Lundhccc74732001-02-18 22:13:49 +00003036ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003037 PyErr_SetString(
3038 PyExc_UnicodeError,
3039 "\\N escapes not supported (can't load unicodedata module)"
3040 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003041 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_XDECREF(errorHandler);
3043 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003044 return NULL;
3045
Fredrik Lundhccc74732001-02-18 22:13:49 +00003046onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 return NULL;
3051}
3052
3053/* Return a Unicode-Escape string version of the Unicode object.
3054
3055 If quotes is true, the string is enclosed in u"" or u'' quotes as
3056 appropriate.
3057
3058*/
3059
Thomas Wouters477c8d52006-05-27 19:21:47 +00003060Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3061 Py_ssize_t size,
3062 Py_UNICODE ch)
3063{
3064 /* like wcschr, but doesn't stop at NULL characters */
3065
3066 while (size-- > 0) {
3067 if (*s == ch)
3068 return s;
3069 s++;
3070 }
3071
3072 return NULL;
3073}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003074
/* Lowercase hexadecimal digits; indexed with a 4-bit value (0..15) by the
   escape encoders below to emit one hex digit per nibble. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00003075static const char *hexdigits = "0123456789abcdef";
3076
3077PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3078 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003080 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082
Thomas Wouters89f507f2006-12-13 04:49:30 +00003083 /* XXX(nnorwitz): rather than over-allocating, it would be
3084 better to choose a different scheme. Perhaps scan the
3085 first N-chars of the string and allocate based on that size.
3086 */
3087 /* Initial allocation is based on the longest-possible unichr
3088 escape.
3089
3090 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3091 unichr, so in this case it's the longest unichr escape. In
3092 narrow (UTF-16) builds this is five chars per source unichr
3093 since there are two unichrs in the surrogate pair, so in narrow
3094 (UTF-16) builds it's not the longest unichr escape.
3095
3096 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3097 so in the narrow (UTF-16) build case it's the longest unichr
3098 escape.
3099 */
3100
Christian Heimes9c4756e2008-05-26 13:22:05 +00003101 repr = PyByteArray_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003102#ifdef Py_UNICODE_WIDE
3103 + 10*size
3104#else
3105 + 6*size
3106#endif
3107 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 if (repr == NULL)
3109 return NULL;
3110
Christian Heimes9c4756e2008-05-26 13:22:05 +00003111 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 while (size-- > 0) {
3114 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003115
Walter Dörwald79e913e2007-05-12 11:08:06 +00003116 /* Escape backslashes */
3117 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 *p++ = '\\';
3119 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003120 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003121 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003122
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003123#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003124 /* Map 21-bit characters to '\U00xxxxxx' */
3125 else if (ch >= 0x10000) {
3126 *p++ = '\\';
3127 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003128 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3129 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3130 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3131 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3132 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3133 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3134 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3135 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003136 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003137 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003138#else
3139 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003140 else if (ch >= 0xD800 && ch < 0xDC00) {
3141 Py_UNICODE ch2;
3142 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003143
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003144 ch2 = *s++;
3145 size--;
3146 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3147 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3148 *p++ = '\\';
3149 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003150 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3151 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3152 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3153 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3154 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3155 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3156 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3157 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003158 continue;
3159 }
3160 /* Fall through: isolated surrogates are copied as-is */
3161 s--;
3162 size++;
3163 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003164#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003167 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 *p++ = '\\';
3169 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003170 *p++ = hexdigits[(ch >> 12) & 0x000F];
3171 *p++ = hexdigits[(ch >> 8) & 0x000F];
3172 *p++ = hexdigits[(ch >> 4) & 0x000F];
3173 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003175
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003176 /* Map special whitespace to '\t', \n', '\r' */
3177 else if (ch == '\t') {
3178 *p++ = '\\';
3179 *p++ = 't';
3180 }
3181 else if (ch == '\n') {
3182 *p++ = '\\';
3183 *p++ = 'n';
3184 }
3185 else if (ch == '\r') {
3186 *p++ = '\\';
3187 *p++ = 'r';
3188 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003189
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003190 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003191 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003193 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003194 *p++ = hexdigits[(ch >> 4) & 0x000F];
3195 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003196 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003197
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 /* Copy everything else as-is */
3199 else
3200 *p++ = (char) ch;
3201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202
Christian Heimes72b710a2008-05-26 13:28:38 +00003203 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003204 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003205 Py_DECREF(repr);
3206 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207}
3208
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3210{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003211 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 return NULL;
3215 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003216 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3217 PyUnicode_GET_SIZE(unicode));
3218
3219 if (!s)
3220 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003221 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003222 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003223 Py_DECREF(s);
3224 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225}
3226
3227/* --- Raw Unicode Escape Codec ------------------------------------------- */
3228
3229PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003230 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 const char *errors)
3232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003234 Py_ssize_t startinpos;
3235 Py_ssize_t endinpos;
3236 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 const char *end;
3240 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 PyObject *errorHandler = NULL;
3242 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003243
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 /* Escaped strings will always be longer than the resulting
3245 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 length after conversion to the true value. (But decoding error
3247 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 v = _PyUnicode_New(size);
3249 if (v == NULL)
3250 goto onError;
3251 if (size == 0)
3252 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 end = s + size;
3255 while (s < end) {
3256 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003257 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260
3261 /* Non-escape characters are interpreted as Unicode ordinals */
3262 if (*s != '\\') {
3263 *p++ = (unsigned char)*s++;
3264 continue;
3265 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
3268 /* \u-escapes are only interpreted iff the number of leading
3269 backslashes if odd */
3270 bs = s;
3271 for (;s < end;) {
3272 if (*s != '\\')
3273 break;
3274 *p++ = (unsigned char)*s++;
3275 }
3276 if (((s - bs) & 1) == 0 ||
3277 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 continue;
3280 }
3281 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 s++;
3284
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003285 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003287 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003289 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 endinpos = s-starts;
3291 if (unicode_decode_call_errorhandler(
3292 errors, &errorHandler,
3293 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003294 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 }
3299 x = (x<<4) & ~0xF;
3300 if (c >= '0' && c <= '9')
3301 x += c - '0';
3302 else if (c >= 'a' && c <= 'f')
3303 x += 10 + c - 'a';
3304 else
3305 x += 10 + c - 'A';
3306 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003307 if (x <= 0xffff)
3308 /* UCS-2 character */
3309 *p++ = (Py_UNICODE) x;
3310 else if (x <= 0x10ffff) {
3311 /* UCS-4 character. Either store directly, or as
3312 surrogate pair. */
3313#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003314 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003315#else
3316 x -= 0x10000L;
3317 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3318 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3319#endif
3320 } else {
3321 endinpos = s-starts;
3322 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003323 if (unicode_decode_call_errorhandler(
3324 errors, &errorHandler,
3325 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003326 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003327 (PyObject **)&v, &outpos, &p))
3328 goto onError;
3329 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 nextByte:
3331 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003333 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003334 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 Py_XDECREF(errorHandler);
3336 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 onError:
3340 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 Py_XDECREF(errorHandler);
3342 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 return NULL;
3344}
3345
3346PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003349 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 char *p;
3351 char *q;
3352
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003353#ifdef Py_UNICODE_WIDE
Christian Heimes9c4756e2008-05-26 13:22:05 +00003354 repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003355#else
Christian Heimes9c4756e2008-05-26 13:22:05 +00003356 repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003357#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 if (repr == NULL)
3359 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003360 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003361 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362
Christian Heimes9c4756e2008-05-26 13:22:05 +00003363 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 while (size-- > 0) {
3365 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003366#ifdef Py_UNICODE_WIDE
3367 /* Map 32-bit characters to '\Uxxxxxxxx' */
3368 if (ch >= 0x10000) {
3369 *p++ = '\\';
3370 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003371 *p++ = hexdigits[(ch >> 28) & 0xf];
3372 *p++ = hexdigits[(ch >> 24) & 0xf];
3373 *p++ = hexdigits[(ch >> 20) & 0xf];
3374 *p++ = hexdigits[(ch >> 16) & 0xf];
3375 *p++ = hexdigits[(ch >> 12) & 0xf];
3376 *p++ = hexdigits[(ch >> 8) & 0xf];
3377 *p++ = hexdigits[(ch >> 4) & 0xf];
3378 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003379 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003380 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003381#else
3382 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3383 if (ch >= 0xD800 && ch < 0xDC00) {
3384 Py_UNICODE ch2;
3385 Py_UCS4 ucs;
3386
3387 ch2 = *s++;
3388 size--;
3389 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3390 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3391 *p++ = '\\';
3392 *p++ = 'U';
3393 *p++ = hexdigits[(ucs >> 28) & 0xf];
3394 *p++ = hexdigits[(ucs >> 24) & 0xf];
3395 *p++ = hexdigits[(ucs >> 20) & 0xf];
3396 *p++ = hexdigits[(ucs >> 16) & 0xf];
3397 *p++ = hexdigits[(ucs >> 12) & 0xf];
3398 *p++ = hexdigits[(ucs >> 8) & 0xf];
3399 *p++ = hexdigits[(ucs >> 4) & 0xf];
3400 *p++ = hexdigits[ucs & 0xf];
3401 continue;
3402 }
3403 /* Fall through: isolated surrogates are copied as-is */
3404 s--;
3405 size++;
3406 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003407#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 /* Map 16-bit characters to '\uxxxx' */
3409 if (ch >= 256) {
3410 *p++ = '\\';
3411 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003412 *p++ = hexdigits[(ch >> 12) & 0xf];
3413 *p++ = hexdigits[(ch >> 8) & 0xf];
3414 *p++ = hexdigits[(ch >> 4) & 0xf];
3415 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 }
3417 /* Copy everything else as-is */
3418 else
3419 *p++ = (char) ch;
3420 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003421 size = p - q;
3422
3423 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003424 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003425 Py_DECREF(repr);
3426 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427}
3428
3429PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3430{
Walter Dörwald711005d2007-05-12 12:03:26 +00003431 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003433 PyErr_BadArgument();
3434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003436 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3437 PyUnicode_GET_SIZE(unicode));
3438
3439 if (!s)
3440 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003441 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003442 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003443 Py_DECREF(s);
3444 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445}
3446
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003447/* --- Unicode Internal Codec ------------------------------------------- */
3448
3449PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003450 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003451 const char *errors)
3452{
3453 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003454 Py_ssize_t startinpos;
3455 Py_ssize_t endinpos;
3456 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003457 PyUnicodeObject *v;
3458 Py_UNICODE *p;
3459 const char *end;
3460 const char *reason;
3461 PyObject *errorHandler = NULL;
3462 PyObject *exc = NULL;
3463
Neal Norwitzd43069c2006-01-08 01:12:10 +00003464#ifdef Py_UNICODE_WIDE
3465 Py_UNICODE unimax = PyUnicode_GetMax();
3466#endif
3467
Thomas Wouters89f507f2006-12-13 04:49:30 +00003468 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003469 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3470 if (v == NULL)
3471 goto onError;
3472 if (PyUnicode_GetSize((PyObject *)v) == 0)
3473 return (PyObject *)v;
3474 p = PyUnicode_AS_UNICODE(v);
3475 end = s + size;
3476
3477 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003478 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003479 /* We have to sanity check the raw data, otherwise doom looms for
3480 some malformed UCS-4 data. */
3481 if (
3482 #ifdef Py_UNICODE_WIDE
3483 *p > unimax || *p < 0 ||
3484 #endif
3485 end-s < Py_UNICODE_SIZE
3486 )
3487 {
3488 startinpos = s - starts;
3489 if (end-s < Py_UNICODE_SIZE) {
3490 endinpos = end-starts;
3491 reason = "truncated input";
3492 }
3493 else {
3494 endinpos = s - starts + Py_UNICODE_SIZE;
3495 reason = "illegal code point (> 0x10FFFF)";
3496 }
3497 outpos = p - PyUnicode_AS_UNICODE(v);
3498 if (unicode_decode_call_errorhandler(
3499 errors, &errorHandler,
3500 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003501 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003502 (PyObject **)&v, &outpos, &p)) {
3503 goto onError;
3504 }
3505 }
3506 else {
3507 p++;
3508 s += Py_UNICODE_SIZE;
3509 }
3510 }
3511
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003512 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003513 goto onError;
3514 Py_XDECREF(errorHandler);
3515 Py_XDECREF(exc);
3516 return (PyObject *)v;
3517
3518 onError:
3519 Py_XDECREF(v);
3520 Py_XDECREF(errorHandler);
3521 Py_XDECREF(exc);
3522 return NULL;
3523}
3524
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525/* --- Latin-1 Codec ------------------------------------------------------ */
3526
3527PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 const char *errors)
3530{
3531 PyUnicodeObject *v;
3532 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003535 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003536 Py_UNICODE r = *(unsigned char*)s;
3537 return PyUnicode_FromUnicode(&r, 1);
3538 }
3539
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 v = _PyUnicode_New(size);
3541 if (v == NULL)
3542 goto onError;
3543 if (size == 0)
3544 return (PyObject *)v;
3545 p = PyUnicode_AS_UNICODE(v);
3546 while (size-- > 0)
3547 *p++ = (unsigned char)*s++;
3548 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003549
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 onError:
3551 Py_XDECREF(v);
3552 return NULL;
3553}
3554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555/* create or adjust a UnicodeEncodeError */
3556static void make_encode_exception(PyObject **exceptionObject,
3557 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 const Py_UNICODE *unicode, Py_ssize_t size,
3559 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 if (*exceptionObject == NULL) {
3563 *exceptionObject = PyUnicodeEncodeError_Create(
3564 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 }
3566 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3568 goto onError;
3569 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3570 goto onError;
3571 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3572 goto onError;
3573 return;
3574 onError:
3575 Py_DECREF(*exceptionObject);
3576 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 }
3578}
3579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580/* raises a UnicodeEncodeError */
3581static void raise_encode_exception(PyObject **exceptionObject,
3582 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003583 const Py_UNICODE *unicode, Py_ssize_t size,
3584 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 const char *reason)
3586{
3587 make_encode_exception(exceptionObject,
3588 encoding, unicode, size, startpos, endpos, reason);
3589 if (*exceptionObject != NULL)
3590 PyCodec_StrictErrors(*exceptionObject);
3591}
3592
3593/* error handling callback helper:
3594 build arguments, call the callback and check the arguments,
3595 put the result into newpos and return the replacement string, which
3596 has to be freed by the caller */
3597static PyObject *unicode_encode_call_errorhandler(const char *errors,
3598 PyObject **errorHandler,
3599 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003600 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3601 Py_ssize_t startpos, Py_ssize_t endpos,
3602 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003604 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605
3606 PyObject *restuple;
3607 PyObject *resunicode;
3608
3609 if (*errorHandler == NULL) {
3610 *errorHandler = PyCodec_LookupError(errors);
3611 if (*errorHandler == NULL)
3612 return NULL;
3613 }
3614
3615 make_encode_exception(exceptionObject,
3616 encoding, unicode, size, startpos, endpos, reason);
3617 if (*exceptionObject == NULL)
3618 return NULL;
3619
3620 restuple = PyObject_CallFunctionObjArgs(
3621 *errorHandler, *exceptionObject, NULL);
3622 if (restuple == NULL)
3623 return NULL;
3624 if (!PyTuple_Check(restuple)) {
3625 PyErr_Format(PyExc_TypeError, &argparse[4]);
3626 Py_DECREF(restuple);
3627 return NULL;
3628 }
3629 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3630 &resunicode, newpos)) {
3631 Py_DECREF(restuple);
3632 return NULL;
3633 }
3634 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003635 *newpos = size+*newpos;
3636 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003637 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003638 Py_DECREF(restuple);
3639 return NULL;
3640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 Py_INCREF(resunicode);
3642 Py_DECREF(restuple);
3643 return resunicode;
3644}
3645
3646static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003647 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 const char *errors,
3649 int limit)
3650{
3651 /* output object */
3652 PyObject *res;
3653 /* pointers to the beginning and end+1 of input */
3654 const Py_UNICODE *startp = p;
3655 const Py_UNICODE *endp = p + size;
3656 /* pointer to the beginning of the unencodable characters */
3657 /* const Py_UNICODE *badp = NULL; */
3658 /* pointer into the output */
3659 char *str;
3660 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003661 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003662 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3663 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 PyObject *errorHandler = NULL;
3665 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003666 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 /* the following variable is used for caching string comparisons
3668 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3669 int known_errorHandler = -1;
3670
3671 /* allocate enough for a simple encoding without
3672 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003673 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003674 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003675 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003677 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003678 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 ressize = size;
3680
3681 while (p<endp) {
3682 Py_UNICODE c = *p;
3683
3684 /* can we encode this? */
3685 if (c<limit) {
3686 /* no overflow check, because we know that the space is enough */
3687 *str++ = (char)c;
3688 ++p;
3689 }
3690 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003691 Py_ssize_t unicodepos = p-startp;
3692 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003694 Py_ssize_t repsize;
3695 Py_ssize_t newpos;
3696 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 Py_UNICODE *uni2;
3698 /* startpos for collecting unencodable chars */
3699 const Py_UNICODE *collstart = p;
3700 const Py_UNICODE *collend = p;
3701 /* find all unecodable characters */
3702 while ((collend < endp) && ((*collend)>=limit))
3703 ++collend;
3704 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3705 if (known_errorHandler==-1) {
3706 if ((errors==NULL) || (!strcmp(errors, "strict")))
3707 known_errorHandler = 1;
3708 else if (!strcmp(errors, "replace"))
3709 known_errorHandler = 2;
3710 else if (!strcmp(errors, "ignore"))
3711 known_errorHandler = 3;
3712 else if (!strcmp(errors, "xmlcharrefreplace"))
3713 known_errorHandler = 4;
3714 else
3715 known_errorHandler = 0;
3716 }
3717 switch (known_errorHandler) {
3718 case 1: /* strict */
3719 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3720 goto onError;
3721 case 2: /* replace */
3722 while (collstart++<collend)
3723 *str++ = '?'; /* fall through */
3724 case 3: /* ignore */
3725 p = collend;
3726 break;
3727 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003728 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 /* determine replacement size (temporarily (mis)uses p) */
3730 for (p = collstart, repsize = 0; p < collend; ++p) {
3731 if (*p<10)
3732 repsize += 2+1+1;
3733 else if (*p<100)
3734 repsize += 2+2+1;
3735 else if (*p<1000)
3736 repsize += 2+3+1;
3737 else if (*p<10000)
3738 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003739#ifndef Py_UNICODE_WIDE
3740 else
3741 repsize += 2+5+1;
3742#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 else if (*p<100000)
3744 repsize += 2+5+1;
3745 else if (*p<1000000)
3746 repsize += 2+6+1;
3747 else
3748 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003749#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 }
3751 requiredsize = respos+repsize+(endp-collend);
3752 if (requiredsize > ressize) {
3753 if (requiredsize<2*ressize)
3754 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003755 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003757 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 ressize = requiredsize;
3759 }
3760 /* generate replacement (temporarily (mis)uses p) */
3761 for (p = collstart; p < collend; ++p) {
3762 str += sprintf(str, "&#%d;", (int)*p);
3763 }
3764 p = collend;
3765 break;
3766 default:
3767 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3768 encoding, reason, startp, size, &exc,
3769 collstart-startp, collend-startp, &newpos);
3770 if (repunicode == NULL)
3771 goto onError;
3772 /* need more space? (at least enough for what we
3773 have+the replacement+the rest of the string, so
3774 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003775 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 repsize = PyUnicode_GET_SIZE(repunicode);
3777 requiredsize = respos+repsize+(endp-collend);
3778 if (requiredsize > ressize) {
3779 if (requiredsize<2*ressize)
3780 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003781 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_DECREF(repunicode);
3783 goto onError;
3784 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003785 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 ressize = requiredsize;
3787 }
3788 /* check if there is anything unencodable in the replacement
3789 and copy it to the output */
3790 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3791 c = *uni2;
3792 if (c >= limit) {
3793 raise_encode_exception(&exc, encoding, startp, size,
3794 unicodepos, unicodepos+1, reason);
3795 Py_DECREF(repunicode);
3796 goto onError;
3797 }
3798 *str = (char)c;
3799 }
3800 p = startp + newpos;
3801 Py_DECREF(repunicode);
3802 }
3803 }
3804 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003805 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003806 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003807 onError:
3808 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003811 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812}
3813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 const char *errors)
3817{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819}
3820
3821PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3822{
3823 if (!PyUnicode_Check(unicode)) {
3824 PyErr_BadArgument();
3825 return NULL;
3826 }
3827 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3828 PyUnicode_GET_SIZE(unicode),
3829 NULL);
3830}
3831
3832/* --- 7-bit ASCII Codec -------------------------------------------------- */
3833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003835 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 const char *errors)
3837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 PyUnicodeObject *v;
3840 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003841 Py_ssize_t startinpos;
3842 Py_ssize_t endinpos;
3843 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 const char *e;
3845 PyObject *errorHandler = NULL;
3846 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003849 if (size == 1 && *(unsigned char*)s < 128) {
3850 Py_UNICODE r = *(unsigned char*)s;
3851 return PyUnicode_FromUnicode(&r, 1);
3852 }
Tim Petersced69f82003-09-16 20:30:58 +00003853
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 v = _PyUnicode_New(size);
3855 if (v == NULL)
3856 goto onError;
3857 if (size == 0)
3858 return (PyObject *)v;
3859 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 e = s + size;
3861 while (s < e) {
3862 register unsigned char c = (unsigned char)*s;
3863 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 ++s;
3866 }
3867 else {
3868 startinpos = s-starts;
3869 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003870 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 if (unicode_decode_call_errorhandler(
3872 errors, &errorHandler,
3873 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003874 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003879 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003880 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003881 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 Py_XDECREF(errorHandler);
3883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 onError:
3887 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 Py_XDECREF(errorHandler);
3889 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 return NULL;
3891}
3892
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 const char *errors)
3896{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898}
3899
3900PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3901{
3902 if (!PyUnicode_Check(unicode)) {
3903 PyErr_BadArgument();
3904 return NULL;
3905 }
3906 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3907 PyUnicode_GET_SIZE(unicode),
3908 NULL);
3909}
3910
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003911#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003912
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003913/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003914
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003915#if SIZEOF_INT < SIZEOF_SSIZE_T
3916#define NEED_RETRY
3917#endif
3918
3919/* XXX This code is limited to "true" double-byte encodings, as
3920 a) it assumes an incomplete character consists of a single byte, and
3921 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3922 encodings, see IsDBCSLeadByteEx documentation. */
3923
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.  The byte must pass IsDBCSLeadByte(); in addition
   the character located by CharPrev() in front of it must either be
   the byte itself, not be a lead byte, or start exactly two bytes
   earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3934
3935/*
3936 * Decode MBCS string into unicode object. If 'final' is set, converts
3937 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3938 */
3939static int decode_mbcs(PyUnicodeObject **v,
3940 const char *s, /* MBCS string */
3941 int size, /* sizeof MBCS string */
3942 int final)
3943{
3944 Py_UNICODE *p;
3945 Py_ssize_t n = 0;
3946 int usize = 0;
3947
3948 assert(size >= 0);
3949
3950 /* Skip trailing lead-byte unless 'final' is set */
3951 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3952 --size;
3953
3954 /* First get the size of the result */
3955 if (size > 0) {
3956 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3957 if (usize == 0) {
3958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3959 return -1;
3960 }
3961 }
3962
3963 if (*v == NULL) {
3964 /* Create unicode object */
3965 *v = _PyUnicode_New(usize);
3966 if (*v == NULL)
3967 return -1;
3968 }
3969 else {
3970 /* Extend unicode object */
3971 n = PyUnicode_GET_SIZE(*v);
3972 if (_PyUnicode_Resize(v, n + usize) < 0)
3973 return -1;
3974 }
3975
3976 /* Do the conversion */
3977 if (size > 0) {
3978 p = PyUnicode_AS_UNICODE(*v) + n;
3979 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3981 return -1;
3982 }
3983 }
3984
3985 return size;
3986}
3987
3988PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3989 Py_ssize_t size,
3990 const char *errors,
3991 Py_ssize_t *consumed)
3992{
3993 PyUnicodeObject *v = NULL;
3994 int done;
3995
3996 if (consumed)
3997 *consumed = 0;
3998
3999#ifdef NEED_RETRY
4000 retry:
4001 if (size > INT_MAX)
4002 done = decode_mbcs(&v, s, INT_MAX, 0);
4003 else
4004#endif
4005 done = decode_mbcs(&v, s, (int)size, !consumed);
4006
4007 if (done < 0) {
4008 Py_XDECREF(v);
4009 return NULL;
4010 }
4011
4012 if (consumed)
4013 *consumed += done;
4014
4015#ifdef NEED_RETRY
4016 if (size > INT_MAX) {
4017 s += done;
4018 size -= done;
4019 goto retry;
4020 }
4021#endif
4022
4023 return (PyObject *)v;
4024}
4025
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004026PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004027 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004028 const char *errors)
4029{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004030 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4031}
4032
4033/*
4034 * Convert unicode into string object (MBCS).
4035 * Returns 0 if succeed, -1 otherwise.
4036 */
4037static int encode_mbcs(PyObject **repr,
4038 const Py_UNICODE *p, /* unicode */
4039 int size) /* size of unicode */
4040{
4041 int mbcssize = 0;
4042 Py_ssize_t n = 0;
4043
4044 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004045
4046 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004047 if (size > 0) {
4048 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4049 if (mbcssize == 0) {
4050 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4051 return -1;
4052 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004053 }
4054
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004055 if (*repr == NULL) {
4056 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004057 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004058 if (*repr == NULL)
4059 return -1;
4060 }
4061 else {
4062 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004063 n = PyBytes_Size(*repr);
4064 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004065 return -1;
4066 }
4067
4068 /* Do the conversion */
4069 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004070 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004071 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4072 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4073 return -1;
4074 }
4075 }
4076
4077 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078}
4079
4080PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004081 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004082 const char *errors)
4083{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004084 PyObject *repr = NULL;
4085 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004086
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004087#ifdef NEED_RETRY
4088 retry:
4089 if (size > INT_MAX)
4090 ret = encode_mbcs(&repr, p, INT_MAX);
4091 else
4092#endif
4093 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004094
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004095 if (ret < 0) {
4096 Py_XDECREF(repr);
4097 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004098 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004099
4100#ifdef NEED_RETRY
4101 if (size > INT_MAX) {
4102 p += INT_MAX;
4103 size -= INT_MAX;
4104 goto retry;
4105 }
4106#endif
4107
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004108 return repr;
4109}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004110
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004111PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4112{
4113 if (!PyUnicode_Check(unicode)) {
4114 PyErr_BadArgument();
4115 return NULL;
4116 }
4117 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4118 PyUnicode_GET_SIZE(unicode),
4119 NULL);
4120}
4121
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004122#undef NEED_RETRY
4123
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004124#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004125
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126/* --- Character Mapping Codec -------------------------------------------- */
4127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004129 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 PyObject *mapping,
4131 const char *errors)
4132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004134 Py_ssize_t startinpos;
4135 Py_ssize_t endinpos;
4136 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 PyUnicodeObject *v;
4139 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004140 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 PyObject *errorHandler = NULL;
4142 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004143 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004144 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004145
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 /* Default to Latin-1 */
4147 if (mapping == NULL)
4148 return PyUnicode_DecodeLatin1(s, size, errors);
4149
4150 v = _PyUnicode_New(size);
4151 if (v == NULL)
4152 goto onError;
4153 if (size == 0)
4154 return (PyObject *)v;
4155 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004157 if (PyUnicode_CheckExact(mapping)) {
4158 mapstring = PyUnicode_AS_UNICODE(mapping);
4159 maplen = PyUnicode_GET_SIZE(mapping);
4160 while (s < e) {
4161 unsigned char ch = *s;
4162 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004164 if (ch < maplen)
4165 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004167 if (x == 0xfffe) {
4168 /* undefined mapping */
4169 outpos = p-PyUnicode_AS_UNICODE(v);
4170 startinpos = s-starts;
4171 endinpos = startinpos+1;
4172 if (unicode_decode_call_errorhandler(
4173 errors, &errorHandler,
4174 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004176 (PyObject **)&v, &outpos, &p)) {
4177 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004178 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004179 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004180 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004181 *p++ = x;
4182 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004184 }
4185 else {
4186 while (s < e) {
4187 unsigned char ch = *s;
4188 PyObject *w, *x;
4189
4190 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004191 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004192 if (w == NULL)
4193 goto onError;
4194 x = PyObject_GetItem(mapping, w);
4195 Py_DECREF(w);
4196 if (x == NULL) {
4197 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4198 /* No mapping found means: mapping is undefined. */
4199 PyErr_Clear();
4200 x = Py_None;
4201 Py_INCREF(x);
4202 } else
4203 goto onError;
4204 }
4205
4206 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004207 if (PyLong_Check(x)) {
4208 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004209 if (value < 0 || value > 65535) {
4210 PyErr_SetString(PyExc_TypeError,
4211 "character mapping must be in range(65536)");
4212 Py_DECREF(x);
4213 goto onError;
4214 }
4215 *p++ = (Py_UNICODE)value;
4216 }
4217 else if (x == Py_None) {
4218 /* undefined mapping */
4219 outpos = p-PyUnicode_AS_UNICODE(v);
4220 startinpos = s-starts;
4221 endinpos = startinpos+1;
4222 if (unicode_decode_call_errorhandler(
4223 errors, &errorHandler,
4224 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004225 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004226 (PyObject **)&v, &outpos, &p)) {
4227 Py_DECREF(x);
4228 goto onError;
4229 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004230 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004231 continue;
4232 }
4233 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004234 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004235
4236 if (targetsize == 1)
4237 /* 1-1 mapping */
4238 *p++ = *PyUnicode_AS_UNICODE(x);
4239
4240 else if (targetsize > 1) {
4241 /* 1-n mapping */
4242 if (targetsize > extrachars) {
4243 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004244 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4245 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004246 (targetsize << 2);
4247 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004248 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004249 if (_PyUnicode_Resize(&v,
4250 PyUnicode_GET_SIZE(v) + needed) < 0) {
4251 Py_DECREF(x);
4252 goto onError;
4253 }
4254 p = PyUnicode_AS_UNICODE(v) + oldpos;
4255 }
4256 Py_UNICODE_COPY(p,
4257 PyUnicode_AS_UNICODE(x),
4258 targetsize);
4259 p += targetsize;
4260 extrachars -= targetsize;
4261 }
4262 /* 1-0 mapping: skip the character */
4263 }
4264 else {
4265 /* wrong return value */
4266 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004267 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004268 Py_DECREF(x);
4269 goto onError;
4270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004272 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 }
4275 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004276 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 Py_XDECREF(errorHandler);
4279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004281
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 Py_XDECREF(errorHandler);
4284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 Py_XDECREF(v);
4286 return NULL;
4287}
4288
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004289/* Charmap encoding: the lookup table */
4290
4291struct encoding_map{
4292 PyObject_HEAD
4293 unsigned char level1[32];
4294 int count2, count3;
4295 unsigned char level23[1];
4296};
4297
4298static PyObject*
4299encoding_map_size(PyObject *obj, PyObject* args)
4300{
4301 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004302 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004303 128*map->count3);
4304}
4305
4306static PyMethodDef encoding_map_methods[] = {
4307 {"size", encoding_map_size, METH_NOARGS,
4308 PyDoc_STR("Return the size (in bytes) of this object") },
4309 { 0 }
4310};
4311
4312static void
4313encoding_map_dealloc(PyObject* o)
4314{
4315 PyObject_FREE(o);
4316}
4317
4318static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004319 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004320 "EncodingMap", /*tp_name*/
4321 sizeof(struct encoding_map), /*tp_basicsize*/
4322 0, /*tp_itemsize*/
4323 /* methods */
4324 encoding_map_dealloc, /*tp_dealloc*/
4325 0, /*tp_print*/
4326 0, /*tp_getattr*/
4327 0, /*tp_setattr*/
4328 0, /*tp_compare*/
4329 0, /*tp_repr*/
4330 0, /*tp_as_number*/
4331 0, /*tp_as_sequence*/
4332 0, /*tp_as_mapping*/
4333 0, /*tp_hash*/
4334 0, /*tp_call*/
4335 0, /*tp_str*/
4336 0, /*tp_getattro*/
4337 0, /*tp_setattro*/
4338 0, /*tp_as_buffer*/
4339 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4340 0, /*tp_doc*/
4341 0, /*tp_traverse*/
4342 0, /*tp_clear*/
4343 0, /*tp_richcompare*/
4344 0, /*tp_weaklistoffset*/
4345 0, /*tp_iter*/
4346 0, /*tp_iternext*/
4347 encoding_map_methods, /*tp_methods*/
4348 0, /*tp_members*/
4349 0, /*tp_getset*/
4350 0, /*tp_base*/
4351 0, /*tp_dict*/
4352 0, /*tp_descr_get*/
4353 0, /*tp_descr_set*/
4354 0, /*tp_dictoffset*/
4355 0, /*tp_init*/
4356 0, /*tp_alloc*/
4357 0, /*tp_new*/
4358 0, /*tp_free*/
4359 0, /*tp_is_gc*/
4360};
4361
4362PyObject*
4363PyUnicode_BuildEncodingMap(PyObject* string)
4364{
4365 Py_UNICODE *decode;
4366 PyObject *result;
4367 struct encoding_map *mresult;
4368 int i;
4369 int need_dict = 0;
4370 unsigned char level1[32];
4371 unsigned char level2[512];
4372 unsigned char *mlevel1, *mlevel2, *mlevel3;
4373 int count2 = 0, count3 = 0;
4374
4375 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4376 PyErr_BadArgument();
4377 return NULL;
4378 }
4379 decode = PyUnicode_AS_UNICODE(string);
4380 memset(level1, 0xFF, sizeof level1);
4381 memset(level2, 0xFF, sizeof level2);
4382
4383 /* If there isn't a one-to-one mapping of NULL to \0,
4384 or if there are non-BMP characters, we need to use
4385 a mapping dictionary. */
4386 if (decode[0] != 0)
4387 need_dict = 1;
4388 for (i = 1; i < 256; i++) {
4389 int l1, l2;
4390 if (decode[i] == 0
4391 #ifdef Py_UNICODE_WIDE
4392 || decode[i] > 0xFFFF
4393 #endif
4394 ) {
4395 need_dict = 1;
4396 break;
4397 }
4398 if (decode[i] == 0xFFFE)
4399 /* unmapped character */
4400 continue;
4401 l1 = decode[i] >> 11;
4402 l2 = decode[i] >> 7;
4403 if (level1[l1] == 0xFF)
4404 level1[l1] = count2++;
4405 if (level2[l2] == 0xFF)
4406 level2[l2] = count3++;
4407 }
4408
4409 if (count2 >= 0xFF || count3 >= 0xFF)
4410 need_dict = 1;
4411
4412 if (need_dict) {
4413 PyObject *result = PyDict_New();
4414 PyObject *key, *value;
4415 if (!result)
4416 return NULL;
4417 for (i = 0; i < 256; i++) {
4418 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004419 key = PyLong_FromLong(decode[i]);
4420 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004421 if (!key || !value)
4422 goto failed1;
4423 if (PyDict_SetItem(result, key, value) == -1)
4424 goto failed1;
4425 Py_DECREF(key);
4426 Py_DECREF(value);
4427 }
4428 return result;
4429 failed1:
4430 Py_XDECREF(key);
4431 Py_XDECREF(value);
4432 Py_DECREF(result);
4433 return NULL;
4434 }
4435
4436 /* Create a three-level trie */
4437 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4438 16*count2 + 128*count3 - 1);
4439 if (!result)
4440 return PyErr_NoMemory();
4441 PyObject_Init(result, &EncodingMapType);
4442 mresult = (struct encoding_map*)result;
4443 mresult->count2 = count2;
4444 mresult->count3 = count3;
4445 mlevel1 = mresult->level1;
4446 mlevel2 = mresult->level23;
4447 mlevel3 = mresult->level23 + 16*count2;
4448 memcpy(mlevel1, level1, 32);
4449 memset(mlevel2, 0xFF, 16*count2);
4450 memset(mlevel3, 0, 128*count3);
4451 count3 = 0;
4452 for (i = 1; i < 256; i++) {
4453 int o1, o2, o3, i2, i3;
4454 if (decode[i] == 0xFFFE)
4455 /* unmapped character */
4456 continue;
4457 o1 = decode[i]>>11;
4458 o2 = (decode[i]>>7) & 0xF;
4459 i2 = 16*mlevel1[o1] + o2;
4460 if (mlevel2[i2] == 0xFF)
4461 mlevel2[i2] = count3++;
4462 o3 = decode[i] & 0x7F;
4463 i3 = 128*mlevel2[i2] + o3;
4464 mlevel3[i3] = i;
4465 }
4466 return result;
4467}
4468
4469static int
4470encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4471{
4472 struct encoding_map *map = (struct encoding_map*)mapping;
4473 int l1 = c>>11;
4474 int l2 = (c>>7) & 0xF;
4475 int l3 = c & 0x7F;
4476 int i;
4477
4478#ifdef Py_UNICODE_WIDE
4479 if (c > 0xFFFF) {
4480 return -1;
4481 }
4482#endif
4483 if (c == 0)
4484 return 0;
4485 /* level 1*/
4486 i = map->level1[l1];
4487 if (i == 0xFF) {
4488 return -1;
4489 }
4490 /* level 2*/
4491 i = map->level23[16*i+l2];
4492 if (i == 0xFF) {
4493 return -1;
4494 }
4495 /* level 3 */
4496 i = map->level23[16*map->count2 + 128*i + l3];
4497 if (i == 0) {
4498 return -1;
4499 }
4500 return i;
4501}
4502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503/* Lookup the character ch in the mapping. If the character
4504 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004505 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507{
Christian Heimes217cfd12007-12-02 14:31:20 +00004508 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 PyObject *x;
4510
4511 if (w == NULL)
4512 return NULL;
4513 x = PyObject_GetItem(mapping, w);
4514 Py_DECREF(w);
4515 if (x == NULL) {
4516 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4517 /* No mapping found means: mapping is undefined. */
4518 PyErr_Clear();
4519 x = Py_None;
4520 Py_INCREF(x);
4521 return x;
4522 } else
4523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004525 else if (x == Py_None)
4526 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004527 else if (PyLong_Check(x)) {
4528 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 if (value < 0 || value > 255) {
4530 PyErr_SetString(PyExc_TypeError,
4531 "character mapping must be in range(256)");
4532 Py_DECREF(x);
4533 return NULL;
4534 }
4535 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004537 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004541 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004542 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004543 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 Py_DECREF(x);
4545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 }
4547}
4548
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004549static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004550charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551{
Christian Heimes72b710a2008-05-26 13:28:38 +00004552 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004553 /* exponentially overallocate to minimize reallocations */
4554 if (requiredsize < 2*outsize)
4555 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004556 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004557 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004558 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004559}
4560
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004565 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 space is available. Return a new reference to the object that
4567 was put in the output buffer, or Py_None, if the mapping was undefined
4568 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004569 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004572 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004574 PyObject *rep;
4575 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004576 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577
Christian Heimes90aa7642007-12-19 02:45:37 +00004578 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004579 int res = encoding_map_lookup(c, mapping);
4580 Py_ssize_t requiredsize = *outpos+1;
4581 if (res == -1)
4582 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004583 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004584 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004585 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004586 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004587 outstart[(*outpos)++] = (char)res;
4588 return enc_SUCCESS;
4589 }
4590
4591 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004593 return enc_EXCEPTION;
4594 else if (rep==Py_None) {
4595 Py_DECREF(rep);
4596 return enc_FAILED;
4597 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004598 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004599 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004600 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004601 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004603 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004605 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004606 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 }
4608 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004609 const char *repchars = PyBytes_AS_STRING(rep);
4610 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004611 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004612 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004613 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004617 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 memcpy(outstart + *outpos, repchars, repsize);
4619 *outpos += repsize;
4620 }
4621 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004622 Py_DECREF(rep);
4623 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624}
4625
4626/* handle an error in PyUnicode_EncodeCharmap
4627 Return 0 on success, -1 on error */
4628static
4629int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004630 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004632 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004633 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634{
4635 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 Py_ssize_t repsize;
4637 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 Py_UNICODE *uni2;
4639 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 Py_ssize_t collstartpos = *inpos;
4641 Py_ssize_t collendpos = *inpos+1;
4642 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 char *encoding = "charmap";
4644 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004645 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 /* find all unencodable characters */
4648 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004649 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004650 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004651 int res = encoding_map_lookup(p[collendpos], mapping);
4652 if (res != -1)
4653 break;
4654 ++collendpos;
4655 continue;
4656 }
4657
4658 rep = charmapencode_lookup(p[collendpos], mapping);
4659 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004661 else if (rep!=Py_None) {
4662 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 break;
4664 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004665 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 ++collendpos;
4667 }
4668 /* cache callback name lookup
4669 * (if not done yet, i.e. it's the first error) */
4670 if (*known_errorHandler==-1) {
4671 if ((errors==NULL) || (!strcmp(errors, "strict")))
4672 *known_errorHandler = 1;
4673 else if (!strcmp(errors, "replace"))
4674 *known_errorHandler = 2;
4675 else if (!strcmp(errors, "ignore"))
4676 *known_errorHandler = 3;
4677 else if (!strcmp(errors, "xmlcharrefreplace"))
4678 *known_errorHandler = 4;
4679 else
4680 *known_errorHandler = 0;
4681 }
4682 switch (*known_errorHandler) {
4683 case 1: /* strict */
4684 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4685 return -1;
4686 case 2: /* replace */
4687 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4688 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004689 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 return -1;
4691 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004692 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4694 return -1;
4695 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 }
4697 /* fall through */
4698 case 3: /* ignore */
4699 *inpos = collendpos;
4700 break;
4701 case 4: /* xmlcharrefreplace */
4702 /* generate replacement (temporarily (mis)uses p) */
4703 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4704 char buffer[2+29+1+1];
4705 char *cp;
4706 sprintf(buffer, "&#%d;", (int)p[collpos]);
4707 for (cp = buffer; *cp; ++cp) {
4708 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004709 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004711 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4713 return -1;
4714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 }
4716 }
4717 *inpos = collendpos;
4718 break;
4719 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004720 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 encoding, reason, p, size, exceptionObject,
4722 collstartpos, collendpos, &newpos);
4723 if (repunicode == NULL)
4724 return -1;
4725 /* generate replacement */
4726 repsize = PyUnicode_GET_SIZE(repunicode);
4727 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4728 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004729 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 return -1;
4731 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004732 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4735 return -1;
4736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 }
4738 *inpos = newpos;
4739 Py_DECREF(repunicode);
4740 }
4741 return 0;
4742}
4743
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 PyObject *mapping,
4747 const char *errors)
4748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 /* output object */
4750 PyObject *res = NULL;
4751 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 PyObject *errorHandler = NULL;
4756 PyObject *exc = NULL;
4757 /* the following variable is used for caching string comparisons
4758 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4759 * 3=ignore, 4=xmlcharrefreplace */
4760 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761
4762 /* Default to Latin-1 */
4763 if (mapping == NULL)
4764 return PyUnicode_EncodeLatin1(p, size, errors);
4765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 /* allocate enough for a simple encoding without
4767 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004768 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 if (res == NULL)
4770 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004771 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 while (inpos<size) {
4775 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004776 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004777 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004779 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 if (charmap_encoding_error(p, size, &inpos, mapping,
4781 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004782 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004783 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004784 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 else
4788 /* done with this character => adjust input position */
4789 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004793 if (respos<PyBytes_GET_SIZE(res))
4794 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 Py_XDECREF(exc);
4797 Py_XDECREF(errorHandler);
4798 return res;
4799
4800 onError:
4801 Py_XDECREF(res);
4802 Py_XDECREF(exc);
4803 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 return NULL;
4805}
4806
4807PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4808 PyObject *mapping)
4809{
4810 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4811 PyErr_BadArgument();
4812 return NULL;
4813 }
4814 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4815 PyUnicode_GET_SIZE(unicode),
4816 mapping,
4817 NULL);
4818}
4819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820/* create or adjust a UnicodeTranslateError */
4821static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 const Py_UNICODE *unicode, Py_ssize_t size,
4823 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 if (*exceptionObject == NULL) {
4827 *exceptionObject = PyUnicodeTranslateError_Create(
4828 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
4830 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4832 goto onError;
4833 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4834 goto onError;
4835 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4836 goto onError;
4837 return;
4838 onError:
4839 Py_DECREF(*exceptionObject);
4840 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
4842}
4843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844/* raises a UnicodeTranslateError */
4845static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004846 const Py_UNICODE *unicode, Py_ssize_t size,
4847 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 const char *reason)
4849{
4850 make_translate_exception(exceptionObject,
4851 unicode, size, startpos, endpos, reason);
4852 if (*exceptionObject != NULL)
4853 PyCodec_StrictErrors(*exceptionObject);
4854}
4855
4856/* error handling callback helper:
4857 build arguments, call the callback and check the arguments,
4858 put the result into newpos and return the replacement string, which
4859 has to be freed by the caller */
4860static PyObject *unicode_translate_call_errorhandler(const char *errors,
4861 PyObject **errorHandler,
4862 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4864 Py_ssize_t startpos, Py_ssize_t endpos,
4865 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004867 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004869 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 PyObject *restuple;
4871 PyObject *resunicode;
4872
4873 if (*errorHandler == NULL) {
4874 *errorHandler = PyCodec_LookupError(errors);
4875 if (*errorHandler == NULL)
4876 return NULL;
4877 }
4878
4879 make_translate_exception(exceptionObject,
4880 unicode, size, startpos, endpos, reason);
4881 if (*exceptionObject == NULL)
4882 return NULL;
4883
4884 restuple = PyObject_CallFunctionObjArgs(
4885 *errorHandler, *exceptionObject, NULL);
4886 if (restuple == NULL)
4887 return NULL;
4888 if (!PyTuple_Check(restuple)) {
4889 PyErr_Format(PyExc_TypeError, &argparse[4]);
4890 Py_DECREF(restuple);
4891 return NULL;
4892 }
4893 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 Py_DECREF(restuple);
4896 return NULL;
4897 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 if (i_newpos<0)
4899 *newpos = size+i_newpos;
4900 else
4901 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004902 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004903 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004904 Py_DECREF(restuple);
4905 return NULL;
4906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 Py_INCREF(resunicode);
4908 Py_DECREF(restuple);
4909 return resunicode;
4910}
4911
4912/* Lookup the character ch in the mapping and put the result in result,
4913 which must be decrefed by the caller.
4914 Return 0 on success, -1 on error */
4915static
4916int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4917{
Christian Heimes217cfd12007-12-02 14:31:20 +00004918 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 PyObject *x;
4920
4921 if (w == NULL)
4922 return -1;
4923 x = PyObject_GetItem(mapping, w);
4924 Py_DECREF(w);
4925 if (x == NULL) {
4926 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4927 /* No mapping found means: use 1:1 mapping. */
4928 PyErr_Clear();
4929 *result = NULL;
4930 return 0;
4931 } else
4932 return -1;
4933 }
4934 else if (x == Py_None) {
4935 *result = x;
4936 return 0;
4937 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004938 else if (PyLong_Check(x)) {
4939 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 long max = PyUnicode_GetMax();
4941 if (value < 0 || value > max) {
4942 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004943 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 Py_DECREF(x);
4945 return -1;
4946 }
4947 *result = x;
4948 return 0;
4949 }
4950 else if (PyUnicode_Check(x)) {
4951 *result = x;
4952 return 0;
4953 }
4954 else {
4955 /* wrong return value */
4956 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004957 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00004958 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 return -1;
4960 }
4961}
4962/* ensure that *outobj is at least requiredsize characters long,
4963if not reallocate and adjust various state variables.
4964Return 0 on success, -1 on error */
4965static
Walter Dörwald4894c302003-10-24 14:25:28 +00004966int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004967 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004970 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004974 if (requiredsize < 2 * oldsize)
4975 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004976 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 return -1;
4978 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 }
4980 return 0;
4981}
4982/* lookup the character, put the result in the output string and adjust
4983 various state variables. Return a new reference to the object that
4984 was put in the output buffer in *result, or Py_None, if the mapping was
4985 undefined (in which case no character was written).
4986 The called must decref result.
4987 Return 0 on success, -1 on error. */
4988static
Walter Dörwald4894c302003-10-24 14:25:28 +00004989int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992{
Walter Dörwald4894c302003-10-24 14:25:28 +00004993 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 return -1;
4995 if (*res==NULL) {
4996 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004997 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 }
4999 else if (*res==Py_None)
5000 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005001 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005003 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 }
5005 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 if (repsize==1) {
5008 /* no overflow check, because we know that the space is enough */
5009 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5010 }
5011 else if (repsize!=0) {
5012 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005013 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005014 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005015 repsize - 1;
5016 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 return -1;
5018 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5019 *outp += repsize;
5020 }
5021 }
5022 else
5023 return -1;
5024 return 0;
5025}
5026
5027PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005028 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 PyObject *mapping,
5030 const char *errors)
5031{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 /* output object */
5033 PyObject *res = NULL;
5034 /* pointers to the beginning and end+1 of input */
5035 const Py_UNICODE *startp = p;
5036 const Py_UNICODE *endp = p + size;
5037 /* pointer into the output */
5038 Py_UNICODE *str;
5039 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 char *reason = "character maps to <undefined>";
5042 PyObject *errorHandler = NULL;
5043 PyObject *exc = NULL;
5044 /* the following variable is used for caching string comparisons
5045 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5046 * 3=ignore, 4=xmlcharrefreplace */
5047 int known_errorHandler = -1;
5048
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 if (mapping == NULL) {
5050 PyErr_BadArgument();
5051 return NULL;
5052 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053
5054 /* allocate enough for a simple 1:1 translation without
5055 replacements, if we need more, we'll resize */
5056 res = PyUnicode_FromUnicode(NULL, size);
5057 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005058 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 return res;
5061 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 while (p<endp) {
5064 /* try to encode it */
5065 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005066 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 goto onError;
5069 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005070 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 if (x!=Py_None) /* it worked => adjust input pointer */
5072 ++p;
5073 else { /* untranslatable character */
5074 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005075 Py_ssize_t repsize;
5076 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 Py_UNICODE *uni2;
5078 /* startpos for collecting untranslatable chars */
5079 const Py_UNICODE *collstart = p;
5080 const Py_UNICODE *collend = p+1;
5081 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 /* find all untranslatable characters */
5084 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005085 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 goto onError;
5087 Py_XDECREF(x);
5088 if (x!=Py_None)
5089 break;
5090 ++collend;
5091 }
5092 /* cache callback name lookup
5093 * (if not done yet, i.e. it's the first error) */
5094 if (known_errorHandler==-1) {
5095 if ((errors==NULL) || (!strcmp(errors, "strict")))
5096 known_errorHandler = 1;
5097 else if (!strcmp(errors, "replace"))
5098 known_errorHandler = 2;
5099 else if (!strcmp(errors, "ignore"))
5100 known_errorHandler = 3;
5101 else if (!strcmp(errors, "xmlcharrefreplace"))
5102 known_errorHandler = 4;
5103 else
5104 known_errorHandler = 0;
5105 }
5106 switch (known_errorHandler) {
5107 case 1: /* strict */
5108 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5109 goto onError;
5110 case 2: /* replace */
5111 /* No need to check for space, this is a 1:1 replacement */
5112 for (coll = collstart; coll<collend; ++coll)
5113 *str++ = '?';
5114 /* fall through */
5115 case 3: /* ignore */
5116 p = collend;
5117 break;
5118 case 4: /* xmlcharrefreplace */
5119 /* generate replacement (temporarily (mis)uses p) */
5120 for (p = collstart; p < collend; ++p) {
5121 char buffer[2+29+1+1];
5122 char *cp;
5123 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005124 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5126 goto onError;
5127 for (cp = buffer; *cp; ++cp)
5128 *str++ = *cp;
5129 }
5130 p = collend;
5131 break;
5132 default:
5133 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5134 reason, startp, size, &exc,
5135 collstart-startp, collend-startp, &newpos);
5136 if (repunicode == NULL)
5137 goto onError;
5138 /* generate replacement */
5139 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005140 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5142 Py_DECREF(repunicode);
5143 goto onError;
5144 }
5145 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5146 *str++ = *uni2;
5147 p = startp + newpos;
5148 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 }
5150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 /* Resize if we allocated to much */
5153 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005154 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005155 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005156 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157 }
5158 Py_XDECREF(exc);
5159 Py_XDECREF(errorHandler);
5160 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 onError:
5163 Py_XDECREF(res);
5164 Py_XDECREF(exc);
5165 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return NULL;
5167}
5168
5169PyObject *PyUnicode_Translate(PyObject *str,
5170 PyObject *mapping,
5171 const char *errors)
5172{
5173 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 str = PyUnicode_FromObject(str);
5176 if (str == NULL)
5177 goto onError;
5178 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5179 PyUnicode_GET_SIZE(str),
5180 mapping,
5181 errors);
5182 Py_DECREF(str);
5183 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005184
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 onError:
5186 Py_XDECREF(str);
5187 return NULL;
5188}
Tim Petersced69f82003-09-16 20:30:58 +00005189
Guido van Rossum9e896b32000-04-05 20:11:21 +00005190/* --- Decimal Encoder ---------------------------------------------------- */
5191
5192int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005194 char *output,
5195 const char *errors)
5196{
5197 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 PyObject *errorHandler = NULL;
5199 PyObject *exc = NULL;
5200 const char *encoding = "decimal";
5201 const char *reason = "invalid decimal Unicode string";
5202 /* the following variable is used for caching string comparisons
5203 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5204 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005205
5206 if (output == NULL) {
5207 PyErr_BadArgument();
5208 return -1;
5209 }
5210
5211 p = s;
5212 end = s + length;
5213 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005215 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005217 Py_ssize_t repsize;
5218 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 Py_UNICODE *uni2;
5220 Py_UNICODE *collstart;
5221 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005222
Guido van Rossum9e896b32000-04-05 20:11:21 +00005223 if (Py_UNICODE_ISSPACE(ch)) {
5224 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005226 continue;
5227 }
5228 decimal = Py_UNICODE_TODECIMAL(ch);
5229 if (decimal >= 0) {
5230 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005232 continue;
5233 }
Guido van Rossumba477042000-04-06 18:18:10 +00005234 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005235 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005237 continue;
5238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 /* All other characters are considered unencodable */
5240 collstart = p;
5241 collend = p+1;
5242 while (collend < end) {
5243 if ((0 < *collend && *collend < 256) ||
5244 !Py_UNICODE_ISSPACE(*collend) ||
5245 Py_UNICODE_TODECIMAL(*collend))
5246 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005247 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 /* cache callback name lookup
5249 * (if not done yet, i.e. it's the first error) */
5250 if (known_errorHandler==-1) {
5251 if ((errors==NULL) || (!strcmp(errors, "strict")))
5252 known_errorHandler = 1;
5253 else if (!strcmp(errors, "replace"))
5254 known_errorHandler = 2;
5255 else if (!strcmp(errors, "ignore"))
5256 known_errorHandler = 3;
5257 else if (!strcmp(errors, "xmlcharrefreplace"))
5258 known_errorHandler = 4;
5259 else
5260 known_errorHandler = 0;
5261 }
5262 switch (known_errorHandler) {
5263 case 1: /* strict */
5264 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5265 goto onError;
5266 case 2: /* replace */
5267 for (p = collstart; p < collend; ++p)
5268 *output++ = '?';
5269 /* fall through */
5270 case 3: /* ignore */
5271 p = collend;
5272 break;
5273 case 4: /* xmlcharrefreplace */
5274 /* generate replacement (temporarily (mis)uses p) */
5275 for (p = collstart; p < collend; ++p)
5276 output += sprintf(output, "&#%d;", (int)*p);
5277 p = collend;
5278 break;
5279 default:
5280 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5281 encoding, reason, s, length, &exc,
5282 collstart-s, collend-s, &newpos);
5283 if (repunicode == NULL)
5284 goto onError;
5285 /* generate replacement */
5286 repsize = PyUnicode_GET_SIZE(repunicode);
5287 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5288 Py_UNICODE ch = *uni2;
5289 if (Py_UNICODE_ISSPACE(ch))
5290 *output++ = ' ';
5291 else {
5292 decimal = Py_UNICODE_TODECIMAL(ch);
5293 if (decimal >= 0)
5294 *output++ = '0' + decimal;
5295 else if (0 < ch && ch < 256)
5296 *output++ = (char)ch;
5297 else {
5298 Py_DECREF(repunicode);
5299 raise_encode_exception(&exc, encoding,
5300 s, length, collstart-s, collend-s, reason);
5301 goto onError;
5302 }
5303 }
5304 }
5305 p = s + newpos;
5306 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005307 }
5308 }
5309 /* 0-terminate the output string */
5310 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 Py_XDECREF(exc);
5312 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005313 return 0;
5314
5315 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005316 Py_XDECREF(exc);
5317 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005318 return -1;
5319}
5320
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321/* --- Helpers ------------------------------------------------------------ */
5322
Eric Smith8c663262007-08-25 02:26:07 +00005323#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005324#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005325#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005326/* Include _ParseTupleFinds from find.h */
5327#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005328#include "stringlib/find.h"
5329#include "stringlib/partition.h"
5330
Eric Smith5807c412008-05-11 21:00:57 +00005331#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5332#include "stringlib/localeutil.h"
5333
/* Helper macro to fix up start/end slice values.

   Operates on variables named `start` and `end` in the enclosing scope:
   a negative value has (obj)->length added to it and is then floored at 0;
   `end` is additionally capped at (obj)->length before that adjustment.
   `start` is NOT capped at the length, so callers must cope with
   start > end.  `obj` is evaluated several times; pass an expression
   without side effects.

   The body is wrapped in do { } while (0) so the expansion is a single
   statement and stays correct inside an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5346
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005348 PyObject *substr,
5349 Py_ssize_t start,
5350 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005353 PyUnicodeObject* str_obj;
5354 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005355
Thomas Wouters477c8d52006-05-27 19:21:47 +00005356 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5357 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005359 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5360 if (!sub_obj) {
5361 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 return -1;
5363 }
Tim Petersced69f82003-09-16 20:30:58 +00005364
Thomas Wouters477c8d52006-05-27 19:21:47 +00005365 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005366
Thomas Wouters477c8d52006-05-27 19:21:47 +00005367 result = stringlib_count(
5368 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5369 );
5370
5371 Py_DECREF(sub_obj);
5372 Py_DECREF(str_obj);
5373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 return result;
5375}
5376
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005378 PyObject *sub,
5379 Py_ssize_t start,
5380 Py_ssize_t end,
5381 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005384
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005386 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005387 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005388 sub = PyUnicode_FromObject(sub);
5389 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005390 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005391 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 }
Tim Petersced69f82003-09-16 20:30:58 +00005393
Thomas Wouters477c8d52006-05-27 19:21:47 +00005394 if (direction > 0)
5395 result = stringlib_find_slice(
5396 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5397 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5398 start, end
5399 );
5400 else
5401 result = stringlib_rfind_slice(
5402 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5403 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5404 start, end
5405 );
5406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 Py_DECREF(sub);
5409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 return result;
5411}
5412
Tim Petersced69f82003-09-16 20:30:58 +00005413static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414int tailmatch(PyUnicodeObject *self,
5415 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416 Py_ssize_t start,
5417 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 int direction)
5419{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 if (substring->length == 0)
5421 return 1;
5422
Thomas Wouters477c8d52006-05-27 19:21:47 +00005423 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
5425 end -= substring->length;
5426 if (end < start)
5427 return 0;
5428
5429 if (direction > 0) {
5430 if (Py_UNICODE_MATCH(self, end, substring))
5431 return 1;
5432 } else {
5433 if (Py_UNICODE_MATCH(self, start, substring))
5434 return 1;
5435 }
5436
5437 return 0;
5438}
5439
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442 Py_ssize_t start,
5443 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 int direction)
5445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005447
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 str = PyUnicode_FromObject(str);
5449 if (str == NULL)
5450 return -1;
5451 substr = PyUnicode_FromObject(substr);
5452 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005453 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 return -1;
5455 }
Tim Petersced69f82003-09-16 20:30:58 +00005456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 result = tailmatch((PyUnicodeObject *)str,
5458 (PyUnicodeObject *)substr,
5459 start, end, direction);
5460 Py_DECREF(str);
5461 Py_DECREF(substr);
5462 return result;
5463}
5464
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465/* Apply fixfct filter to the Unicode object self and return a
5466 reference to the modified object */
5467
Tim Petersced69f82003-09-16 20:30:58 +00005468static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469PyObject *fixup(PyUnicodeObject *self,
5470 int (*fixfct)(PyUnicodeObject *s))
5471{
5472
5473 PyUnicodeObject *u;
5474
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005475 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 if (u == NULL)
5477 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005478
5479 Py_UNICODE_COPY(u->str, self->str, self->length);
5480
Tim Peters7a29bd52001-09-12 03:03:31 +00005481 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 /* fixfct should return TRUE if it modified the buffer. If
5483 FALSE, return a reference to the original buffer instead
5484 (to save space, not time) */
5485 Py_INCREF(self);
5486 Py_DECREF(u);
5487 return (PyObject*) self;
5488 }
5489 return (PyObject*) u;
5490}
5491
Tim Petersced69f82003-09-16 20:30:58 +00005492static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493int fixupper(PyUnicodeObject *self)
5494{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005495 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 Py_UNICODE *s = self->str;
5497 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 while (len-- > 0) {
5500 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005501
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 ch = Py_UNICODE_TOUPPER(*s);
5503 if (ch != *s) {
5504 status = 1;
5505 *s = ch;
5506 }
5507 s++;
5508 }
5509
5510 return status;
5511}
5512
Tim Petersced69f82003-09-16 20:30:58 +00005513static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514int fixlower(PyUnicodeObject *self)
5515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005516 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 Py_UNICODE *s = self->str;
5518 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 while (len-- > 0) {
5521 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 ch = Py_UNICODE_TOLOWER(*s);
5524 if (ch != *s) {
5525 status = 1;
5526 *s = ch;
5527 }
5528 s++;
5529 }
5530
5531 return status;
5532}
5533
Tim Petersced69f82003-09-16 20:30:58 +00005534static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535int fixswapcase(PyUnicodeObject *self)
5536{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005537 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 Py_UNICODE *s = self->str;
5539 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 while (len-- > 0) {
5542 if (Py_UNICODE_ISUPPER(*s)) {
5543 *s = Py_UNICODE_TOLOWER(*s);
5544 status = 1;
5545 } else if (Py_UNICODE_ISLOWER(*s)) {
5546 *s = Py_UNICODE_TOUPPER(*s);
5547 status = 1;
5548 }
5549 s++;
5550 }
5551
5552 return status;
5553}
5554
Tim Petersced69f82003-09-16 20:30:58 +00005555static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556int fixcapitalize(PyUnicodeObject *self)
5557{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005558 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005559 Py_UNICODE *s = self->str;
5560 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005561
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005562 if (len == 0)
5563 return 0;
5564 if (Py_UNICODE_ISLOWER(*s)) {
5565 *s = Py_UNICODE_TOUPPER(*s);
5566 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005568 s++;
5569 while (--len > 0) {
5570 if (Py_UNICODE_ISUPPER(*s)) {
5571 *s = Py_UNICODE_TOLOWER(*s);
5572 status = 1;
5573 }
5574 s++;
5575 }
5576 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577}
5578
5579static
5580int fixtitle(PyUnicodeObject *self)
5581{
5582 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5583 register Py_UNICODE *e;
5584 int previous_is_cased;
5585
5586 /* Shortcut for single character strings */
5587 if (PyUnicode_GET_SIZE(self) == 1) {
5588 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5589 if (*p != ch) {
5590 *p = ch;
5591 return 1;
5592 }
5593 else
5594 return 0;
5595 }
Tim Petersced69f82003-09-16 20:30:58 +00005596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 e = p + PyUnicode_GET_SIZE(self);
5598 previous_is_cased = 0;
5599 for (; p < e; p++) {
5600 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 if (previous_is_cased)
5603 *p = Py_UNICODE_TOLOWER(ch);
5604 else
5605 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005606
5607 if (Py_UNICODE_ISLOWER(ch) ||
5608 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 Py_UNICODE_ISTITLE(ch))
5610 previous_is_cased = 1;
5611 else
5612 previous_is_cased = 0;
5613 }
5614 return 1;
5615}
5616
Tim Peters8ce9f162004-08-27 01:49:32 +00005617PyObject *
5618PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619{
Tim Peters8ce9f162004-08-27 01:49:32 +00005620 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005621 const Py_UNICODE blank = ' ';
5622 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005623 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5626 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005627 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5628 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005631 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 fseq = PySequence_Fast(seq, "");
5634 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005635 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005636 }
5637
Tim Peters91879ab2004-08-27 22:35:44 +00005638 /* Grrrr. A codec may be invoked to convert str objects to
5639 * Unicode, and so it's possible to call back into Python code
5640 * during PyUnicode_FromObject(), and so it's possible for a sick
5641 * codec to change the size of fseq (if seq is a list). Therefore
5642 * we have to keep refetching the size -- can't assume seqlen
5643 * is invariant.
5644 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005645 seqlen = PySequence_Fast_GET_SIZE(fseq);
5646 /* If empty sequence, return u"". */
5647 if (seqlen == 0) {
5648 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5649 goto Done;
5650 }
5651 /* If singleton sequence with an exact Unicode, return that. */
5652 if (seqlen == 1) {
5653 item = PySequence_Fast_GET_ITEM(fseq, 0);
5654 if (PyUnicode_CheckExact(item)) {
5655 Py_INCREF(item);
5656 res = (PyUnicodeObject *)item;
5657 goto Done;
5658 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005659 }
5660
Tim Peters05eba1f2004-08-27 21:32:02 +00005661 /* At least two items to join, or one that isn't exact Unicode. */
5662 if (seqlen > 1) {
5663 /* Set up sep and seplen -- they're needed. */
5664 if (separator == NULL) {
5665 sep = &blank;
5666 seplen = 1;
5667 }
5668 else {
5669 internal_separator = PyUnicode_FromObject(separator);
5670 if (internal_separator == NULL)
5671 goto onError;
5672 sep = PyUnicode_AS_UNICODE(internal_separator);
5673 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005674 /* In case PyUnicode_FromObject() mutated seq. */
5675 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 }
5677 }
5678
5679 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005680 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005681 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005682 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 res_p = PyUnicode_AS_UNICODE(res);
5684 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005685
Tim Peters05eba1f2004-08-27 21:32:02 +00005686 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005687 Py_ssize_t itemlen;
5688 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005689
5690 item = PySequence_Fast_GET_ITEM(fseq, i);
5691 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005692 if (!PyUnicode_Check(item)) {
5693 PyErr_Format(PyExc_TypeError,
5694 "sequence item %zd: expected str instance,"
5695 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005696 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005697 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005698 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005699 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 if (item == NULL)
5701 goto onError;
5702 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005703
Tim Peters91879ab2004-08-27 22:35:44 +00005704 /* In case PyUnicode_FromObject() mutated seq. */
5705 seqlen = PySequence_Fast_GET_SIZE(fseq);
5706
Tim Peters8ce9f162004-08-27 01:49:32 +00005707 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005709 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005710 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005711 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005712 if (i < seqlen - 1) {
5713 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005714 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005715 goto Overflow;
5716 }
5717 if (new_res_used > res_alloc) {
5718 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005719 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005720 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005721 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005722 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005723 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005724 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005725 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005727 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005728 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005730
5731 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005732 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005733 res_p += itemlen;
5734 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005735 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005736 res_p += seplen;
5737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005739 res_used = new_res_used;
5740 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005741
Tim Peters05eba1f2004-08-27 21:32:02 +00005742 /* Shrink res to match the used area; this probably can't fail,
5743 * but it's cheap to check.
5744 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005745 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005746 goto onError;
5747
5748 Done:
5749 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005750 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 return (PyObject *)res;
5752
Tim Peters8ce9f162004-08-27 01:49:32 +00005753 Overflow:
5754 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005755 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005756 Py_DECREF(item);
5757 /* fall through */
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005760 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005761 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005762 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 return NULL;
5764}
5765
Tim Petersced69f82003-09-16 20:30:58 +00005766static
5767PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 Py_ssize_t left,
5769 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 Py_UNICODE fill)
5771{
5772 PyUnicodeObject *u;
5773
5774 if (left < 0)
5775 left = 0;
5776 if (right < 0)
5777 right = 0;
5778
Tim Peters7a29bd52001-09-12 03:03:31 +00005779 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 Py_INCREF(self);
5781 return self;
5782 }
5783
5784 u = _PyUnicode_New(left + self->length + right);
5785 if (u) {
5786 if (left)
5787 Py_UNICODE_FILL(u->str, fill, left);
5788 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5789 if (right)
5790 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5791 }
5792
5793 return u;
5794}
5795
/* Append data[left:right] as a new unicode object to `list`.

   Relies on the enclosing function providing: a PyObject *str scratch
   variable, a PyObject *list to append to, and an `onError` label that is
   jumped to when creating the substring or appending it fails.  The
   temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the expansion is a single statement
   (the previous form ended in a dangling `else`). */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5806
5807static
5808PyObject *split_whitespace(PyUnicodeObject *self,
5809 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005810 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 register Py_ssize_t i;
5813 register Py_ssize_t j;
5814 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005816 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818 for (i = j = 0; i < len; ) {
5819 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005820 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 i++;
5822 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005823 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 i++;
5825 if (j < i) {
5826 if (maxcount-- <= 0)
5827 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005828 SPLIT_APPEND(buf, j, i);
5829 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 i++;
5831 j = i;
5832 }
5833 }
5834 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005835 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
5837 return list;
5838
5839 onError:
5840 Py_DECREF(list);
5841 return NULL;
5842}
5843
5844PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005845 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847 register Py_ssize_t i;
5848 register Py_ssize_t j;
5849 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 PyObject *list;
5851 PyObject *str;
5852 Py_UNICODE *data;
5853
5854 string = PyUnicode_FromObject(string);
5855 if (string == NULL)
5856 return NULL;
5857 data = PyUnicode_AS_UNICODE(string);
5858 len = PyUnicode_GET_SIZE(string);
5859
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 list = PyList_New(0);
5861 if (!list)
5862 goto onError;
5863
5864 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005868 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005872 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 if (i < len) {
5874 if (data[i] == '\r' && i + 1 < len &&
5875 data[i+1] == '\n')
5876 i += 2;
5877 else
5878 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005879 if (keepends)
5880 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 }
Guido van Rossum86662912000-04-11 15:38:46 +00005882 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 j = i;
5884 }
5885 if (j < len) {
5886 SPLIT_APPEND(data, j, len);
5887 }
5888
5889 Py_DECREF(string);
5890 return list;
5891
5892 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005893 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 Py_DECREF(string);
5895 return NULL;
5896}
5897
Tim Petersced69f82003-09-16 20:30:58 +00005898static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899PyObject *split_char(PyUnicodeObject *self,
5900 PyObject *list,
5901 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005902 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005904 register Py_ssize_t i;
5905 register Py_ssize_t j;
5906 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005908 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005911 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 if (maxcount-- <= 0)
5913 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005914 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 i = j = i + 1;
5916 } else
5917 i++;
5918 }
5919 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005920 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 }
5922 return list;
5923
5924 onError:
5925 Py_DECREF(list);
5926 return NULL;
5927}
5928
Tim Petersced69f82003-09-16 20:30:58 +00005929static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930PyObject *split_substring(PyUnicodeObject *self,
5931 PyObject *list,
5932 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005933 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005935 register Py_ssize_t i;
5936 register Py_ssize_t j;
5937 Py_ssize_t len = self->length;
5938 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 PyObject *str;
5940
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005941 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 if (Py_UNICODE_MATCH(self, i, substring)) {
5943 if (maxcount-- <= 0)
5944 break;
5945 SPLIT_APPEND(self->str, j, i);
5946 i = j = i + sublen;
5947 } else
5948 i++;
5949 }
5950 if (j <= len) {
5951 SPLIT_APPEND(self->str, j, len);
5952 }
5953 return list;
5954
5955 onError:
5956 Py_DECREF(list);
5957 return NULL;
5958}
5959
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005960static
5961PyObject *rsplit_whitespace(PyUnicodeObject *self,
5962 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005963 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005964{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 register Py_ssize_t i;
5966 register Py_ssize_t j;
5967 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005968 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005969 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970
5971 for (i = j = len - 1; i >= 0; ) {
5972 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005973 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005974 i--;
5975 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005976 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005977 i--;
5978 if (j > i) {
5979 if (maxcount-- <= 0)
5980 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005981 SPLIT_APPEND(buf, i + 1, j + 1);
5982 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005983 i--;
5984 j = i;
5985 }
5986 }
5987 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005988 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005989 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 if (PyList_Reverse(list) < 0)
5991 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005992 return list;
5993
5994 onError:
5995 Py_DECREF(list);
5996 return NULL;
5997}
5998
5999static
6000PyObject *rsplit_char(PyUnicodeObject *self,
6001 PyObject *list,
6002 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006003 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 register Py_ssize_t i;
6006 register Py_ssize_t j;
6007 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006008 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006009 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010
6011 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006012 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006013 if (maxcount-- <= 0)
6014 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006015 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006016 j = i = i - 1;
6017 } else
6018 i--;
6019 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006020 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006021 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006022 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 if (PyList_Reverse(list) < 0)
6024 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006025 return list;
6026
6027 onError:
6028 Py_DECREF(list);
6029 return NULL;
6030}
6031
6032static
6033PyObject *rsplit_substring(PyUnicodeObject *self,
6034 PyObject *list,
6035 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006036 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006037{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 register Py_ssize_t i;
6039 register Py_ssize_t j;
6040 Py_ssize_t len = self->length;
6041 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006042 PyObject *str;
6043
6044 for (i = len - sublen, j = len; i >= 0; ) {
6045 if (Py_UNICODE_MATCH(self, i, substring)) {
6046 if (maxcount-- <= 0)
6047 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006048 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006049 j = i;
6050 i -= sublen;
6051 } else
6052 i--;
6053 }
6054 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006055 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006056 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006057 if (PyList_Reverse(list) < 0)
6058 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006059 return list;
6060
6061 onError:
6062 Py_DECREF(list);
6063 return NULL;
6064}
6065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066#undef SPLIT_APPEND
6067
6068static
6069PyObject *split(PyUnicodeObject *self,
6070 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006071 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
6073 PyObject *list;
6074
6075 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006076 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
6078 list = PyList_New(0);
6079 if (!list)
6080 return NULL;
6081
6082 if (substring == NULL)
6083 return split_whitespace(self,list,maxcount);
6084
6085 else if (substring->length == 1)
6086 return split_char(self,list,substring->str[0],maxcount);
6087
6088 else if (substring->length == 0) {
6089 Py_DECREF(list);
6090 PyErr_SetString(PyExc_ValueError, "empty separator");
6091 return NULL;
6092 }
6093 else
6094 return split_substring(self,list,substring,maxcount);
6095}
6096
Tim Petersced69f82003-09-16 20:30:58 +00006097static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006098PyObject *rsplit(PyUnicodeObject *self,
6099 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006100 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006101{
6102 PyObject *list;
6103
6104 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006105 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006106
6107 list = PyList_New(0);
6108 if (!list)
6109 return NULL;
6110
6111 if (substring == NULL)
6112 return rsplit_whitespace(self,list,maxcount);
6113
6114 else if (substring->length == 1)
6115 return rsplit_char(self,list,substring->str[0],maxcount);
6116
6117 else if (substring->length == 0) {
6118 Py_DECREF(list);
6119 PyErr_SetString(PyExc_ValueError, "empty separator");
6120 return NULL;
6121 }
6122 else
6123 return rsplit_substring(self,list,substring,maxcount);
6124}
6125
6126static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127PyObject *replace(PyUnicodeObject *self,
6128 PyUnicodeObject *str1,
6129 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006130 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
6132 PyUnicodeObject *u;
6133
6134 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006135 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
Thomas Wouters477c8d52006-05-27 19:21:47 +00006137 if (str1->length == str2->length) {
6138 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006139 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140 if (str1->length == 1) {
6141 /* replace characters */
6142 Py_UNICODE u1, u2;
6143 if (!findchar(self->str, self->length, str1->str[0]))
6144 goto nothing;
6145 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6146 if (!u)
6147 return NULL;
6148 Py_UNICODE_COPY(u->str, self->str, self->length);
6149 u1 = str1->str[0];
6150 u2 = str2->str[0];
6151 for (i = 0; i < u->length; i++)
6152 if (u->str[i] == u1) {
6153 if (--maxcount < 0)
6154 break;
6155 u->str[i] = u2;
6156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006158 i = fastsearch(
6159 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006161 if (i < 0)
6162 goto nothing;
6163 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6164 if (!u)
6165 return NULL;
6166 Py_UNICODE_COPY(u->str, self->str, self->length);
6167 while (i <= self->length - str1->length)
6168 if (Py_UNICODE_MATCH(self, i, str1)) {
6169 if (--maxcount < 0)
6170 break;
6171 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6172 i += str1->length;
6173 } else
6174 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006177
6178 Py_ssize_t n, i, j, e;
6179 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 Py_UNICODE *p;
6181
6182 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006183 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 if (n > maxcount)
6185 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006186 if (n == 0)
6187 goto nothing;
6188 /* new_size = self->length + n * (str2->length - str1->length)); */
6189 delta = (str2->length - str1->length);
6190 if (delta == 0) {
6191 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006193 product = n * (str2->length - str1->length);
6194 if ((product / (str2->length - str1->length)) != n) {
6195 PyErr_SetString(PyExc_OverflowError,
6196 "replace string is too long");
6197 return NULL;
6198 }
6199 new_size = self->length + product;
6200 if (new_size < 0) {
6201 PyErr_SetString(PyExc_OverflowError,
6202 "replace string is too long");
6203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 }
6205 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006206 u = _PyUnicode_New(new_size);
6207 if (!u)
6208 return NULL;
6209 i = 0;
6210 p = u->str;
6211 e = self->length - str1->length;
6212 if (str1->length > 0) {
6213 while (n-- > 0) {
6214 /* look for next match */
6215 j = i;
6216 while (j <= e) {
6217 if (Py_UNICODE_MATCH(self, j, str1))
6218 break;
6219 j++;
6220 }
6221 if (j > i) {
6222 if (j > e)
6223 break;
6224 /* copy unchanged part [i:j] */
6225 Py_UNICODE_COPY(p, self->str+i, j-i);
6226 p += j - i;
6227 }
6228 /* copy substitution string */
6229 if (str2->length > 0) {
6230 Py_UNICODE_COPY(p, str2->str, str2->length);
6231 p += str2->length;
6232 }
6233 i = j + str1->length;
6234 }
6235 if (i < self->length)
6236 /* copy tail [i:] */
6237 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6238 } else {
6239 /* interleave */
6240 while (n > 0) {
6241 Py_UNICODE_COPY(p, str2->str, str2->length);
6242 p += str2->length;
6243 if (--n <= 0)
6244 break;
6245 *p++ = self->str[i++];
6246 }
6247 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006251
6252nothing:
6253 /* nothing to replace; return original string (when possible) */
6254 if (PyUnicode_CheckExact(self)) {
6255 Py_INCREF(self);
6256 return (PyObject *) self;
6257 }
6258 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259}
6260
6261/* --- Unicode Object Methods --------------------------------------------- */
6262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006263PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006264"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265\n\
6266Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006267characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
6269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006270unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 return fixup(self, fixtitle);
6273}
6274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006275PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006276"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277\n\
6278Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006279have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
6281static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006282unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 return fixup(self, fixcapitalize);
6285}
6286
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, replaces every word in the list by its
   capitalized form and joins the words again via PyUnicode_Join().
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)plain,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6324
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006325/* Argument converter. Coerces to a single unicode character */
6326
6327static int
6328convert_uc(PyObject *obj, void *addr)
6329{
6330 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6331 PyObject *uniobj;
6332 Py_UNICODE *unistr;
6333
6334 uniobj = PyUnicode_FromObject(obj);
6335 if (uniobj == NULL) {
6336 PyErr_SetString(PyExc_TypeError,
6337 "The fill character cannot be converted to Unicode");
6338 return 0;
6339 }
6340 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6341 PyErr_SetString(PyExc_TypeError,
6342 "The fill character must be exactly one character long");
6343 Py_DECREF(uniobj);
6344 return 0;
6345 }
6346 unistr = PyUnicode_AS_UNICODE(uniobj);
6347 *fillcharloc = unistr[0];
6348 Py_DECREF(uniobj);
6349 return 1;
6350}
6351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006353"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006355Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006356done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
6358static PyObject *
6359unicode_center(PyUnicodeObject *self, PyObject *args)
6360{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t marg, left;
6362 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006363 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
Thomas Woutersde017742006-02-16 19:34:37 +00006365 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 return NULL;
6367
Tim Peters7a29bd52001-09-12 03:03:31 +00006368 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 Py_INCREF(self);
6370 return (PyObject*) self;
6371 }
6372
6373 marg = width - self->length;
6374 left = marg / 2 + (marg & width & 1);
6375
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006376 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377}
6378
Marc-André Lemburge5034372000-08-08 08:04:29 +00006379#if 0
6380
6381/* This code should go into some future Unicode collation support
6382 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006383 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006384
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006385/* speedy UTF-16 code point order comparison */
6386/* gleaned from: */
6387/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6388
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006389static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006390{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006391 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006392 0, 0, 0, 0, 0, 0, 0, 0,
6393 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006394 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006395};
6396
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397static int
6398unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6399{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006400 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 Py_UNICODE *s1 = str1->str;
6403 Py_UNICODE *s2 = str2->str;
6404
6405 len1 = str1->length;
6406 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006407
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006409 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006410
6411 c1 = *s1++;
6412 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006413
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006414 if (c1 > (1<<11) * 26)
6415 c1 += utf16Fixup[c1>>11];
6416 if (c2 > (1<<11) * 26)
6417 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006418 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006419
6420 if (c1 != c2)
6421 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006422
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006423 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 }
6425
6426 return (len1 < len2) ? -1 : (len1 != len2);
6427}
6428
Marc-André Lemburge5034372000-08-08 08:04:29 +00006429#else
6430
6431static int
6432unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006434 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006435
6436 Py_UNICODE *s1 = str1->str;
6437 Py_UNICODE *s2 = str2->str;
6438
6439 len1 = str1->length;
6440 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006441
Marc-André Lemburge5034372000-08-08 08:04:29 +00006442 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006443 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006444
Fredrik Lundh45714e92001-06-26 16:39:36 +00006445 c1 = *s1++;
6446 c2 = *s2++;
6447
6448 if (c1 != c2)
6449 return (c1 < c2) ? -1 : 1;
6450
Marc-André Lemburge5034372000-08-08 08:04:29 +00006451 len1--; len2--;
6452 }
6453
6454 return (len1 < len2) ? -1 : (len1 != len2);
6455}
6456
6457#endif
6458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459int PyUnicode_Compare(PyObject *left,
6460 PyObject *right)
6461{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006462 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6463 return unicode_compare((PyUnicodeObject *)left,
6464 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006465 PyErr_Format(PyExc_TypeError,
6466 "Can't compare %.100s and %.100s",
6467 left->ob_type->tp_name,
6468 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 return -1;
6470}
6471
Martin v. Löwis5b222132007-06-10 09:51:05 +00006472int
6473PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6474{
6475 int i;
6476 Py_UNICODE *id;
6477 assert(PyUnicode_Check(uni));
6478 id = PyUnicode_AS_UNICODE(uni);
6479 /* Compare Unicode string and source character set string */
6480 for (i = 0; id[i] && str[i]; i++)
6481 if (id[i] != str[i])
6482 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6483 if (id[i])
6484 return 1; /* uni is longer */
6485 if (str[i])
6486 return -1; /* str is longer */
6487 return 0;
6488}
6489
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006490PyObject *PyUnicode_RichCompare(PyObject *left,
6491 PyObject *right,
6492 int op)
6493{
6494 int result;
6495
6496 result = PyUnicode_Compare(left, right);
6497 if (result == -1 && PyErr_Occurred())
6498 goto onError;
6499
6500 /* Convert the return value to a Boolean */
6501 switch (op) {
6502 case Py_EQ:
6503 result = (result == 0);
6504 break;
6505 case Py_NE:
6506 result = (result != 0);
6507 break;
6508 case Py_LE:
6509 result = (result <= 0);
6510 break;
6511 case Py_GE:
6512 result = (result >= 0);
6513 break;
6514 case Py_LT:
6515 result = (result == -1);
6516 break;
6517 case Py_GT:
6518 result = (result == 1);
6519 break;
6520 }
6521 return PyBool_FromLong(result);
6522
6523 onError:
6524
6525 /* Standard case
6526
6527 Type errors mean that PyUnicode_FromObject() could not convert
6528 one of the arguments (usually the right hand side) to Unicode,
6529 ie. we can't handle the comparison request. However, it is
6530 possible that the other object knows a comparison method, which
6531 is why we return Py_NotImplemented to give the other object a
6532 chance.
6533
6534 */
6535 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6536 PyErr_Clear();
6537 Py_INCREF(Py_NotImplemented);
6538 return Py_NotImplemented;
6539 }
6540 if (op != Py_EQ && op != Py_NE)
6541 return NULL;
6542
6543 /* Equality comparison.
6544
6545 This is a special case: we silence any PyExc_UnicodeDecodeError
6546 and instead turn it into a PyErr_UnicodeWarning.
6547
6548 */
6549 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6550 return NULL;
6551 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006552 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6553 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006554 "equal comparison "
6555 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006556 "interpreting them as being unequal"
6557 :
6558 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006559 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006560 "interpreting them as being unequal",
6561 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006562 return NULL;
6563 result = (op == Py_NE);
6564 return PyBool_FromLong(result);
6565}
6566
Guido van Rossum403d68b2000-03-13 15:55:09 +00006567int PyUnicode_Contains(PyObject *container,
6568 PyObject *element)
6569{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006572
6573 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006574 sub = PyUnicode_FromObject(element);
6575 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006576 PyErr_Format(PyExc_TypeError,
6577 "'in <string>' requires string as left operand, not %s",
6578 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006580 }
6581
Thomas Wouters477c8d52006-05-27 19:21:47 +00006582 str = PyUnicode_FromObject(container);
6583 if (!str) {
6584 Py_DECREF(sub);
6585 return -1;
6586 }
6587
6588 result = stringlib_contains_obj(str, sub);
6589
6590 Py_DECREF(str);
6591 Py_DECREF(sub);
6592
Guido van Rossum403d68b2000-03-13 15:55:09 +00006593 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006594}
6595
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596/* Concat to string or Unicode object giving a new Unicode object. */
6597
6598PyObject *PyUnicode_Concat(PyObject *left,
6599 PyObject *right)
6600{
6601 PyUnicodeObject *u = NULL, *v = NULL, *w;
6602
6603 /* Coerce the two arguments */
6604 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6605 if (u == NULL)
6606 goto onError;
6607 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6608 if (v == NULL)
6609 goto onError;
6610
6611 /* Shortcuts */
6612 if (v == unicode_empty) {
6613 Py_DECREF(v);
6614 return (PyObject *)u;
6615 }
6616 if (u == unicode_empty) {
6617 Py_DECREF(u);
6618 return (PyObject *)v;
6619 }
6620
6621 /* Concat the two Unicode strings */
6622 w = _PyUnicode_New(u->length + v->length);
6623 if (w == NULL)
6624 goto onError;
6625 Py_UNICODE_COPY(w->str, u->str, u->length);
6626 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6627
6628 Py_DECREF(u);
6629 Py_DECREF(v);
6630 return (PyObject *)w;
6631
6632onError:
6633 Py_XDECREF(u);
6634 Py_XDECREF(v);
6635 return NULL;
6636}
6637
Walter Dörwald1ab83302007-05-18 17:15:44 +00006638void
6639PyUnicode_Append(PyObject **pleft, PyObject *right)
6640{
6641 PyObject *new;
6642 if (*pleft == NULL)
6643 return;
6644 if (right == NULL || !PyUnicode_Check(*pleft)) {
6645 Py_DECREF(*pleft);
6646 *pleft = NULL;
6647 return;
6648 }
6649 new = PyUnicode_Concat(*pleft, right);
6650 Py_DECREF(*pleft);
6651 *pleft = new;
6652}
6653
6654void
6655PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6656{
6657 PyUnicode_Append(pleft, right);
6658 Py_XDECREF(right);
6659}
6660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006661PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662"S.count(sub[, start[, end]]) -> int\n\
6663\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006664Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006665string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
6668static PyObject *
6669unicode_count(PyUnicodeObject *self, PyObject *args)
6670{
6671 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006673 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 PyObject *result;
6675
Guido van Rossumb8872e62000-05-09 14:14:27 +00006676 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6677 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 return NULL;
6679
6680 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 if (substring == NULL)
6683 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006684
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Christian Heimes217cfd12007-12-02 14:31:20 +00006687 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688 stringlib_count(self->str + start, end - start,
6689 substring->str, substring->length)
6690 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
6692 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 return result;
6695}
6696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006698"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006700Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006701to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006702handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6704'xmlcharrefreplace' as well as any other name registered with\n\
6705codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706
6707static PyObject *
6708unicode_encode(PyUnicodeObject *self, PyObject *args)
6709{
6710 char *encoding = NULL;
6711 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006712 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6715 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006716 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006717 if (v == NULL)
6718 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006719 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006720 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006721 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006722 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006723 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006724 Py_DECREF(v);
6725 return NULL;
6726 }
6727 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006728
6729 onError:
6730 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006731}
6732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006734"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735\n\
6736Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006737If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
6739static PyObject*
6740unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6741{
6742 Py_UNICODE *e;
6743 Py_UNICODE *p;
6744 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006745 Py_UNICODE *qe;
6746 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 PyUnicodeObject *u;
6748 int tabsize = 8;
6749
6750 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6751 return NULL;
6752
Thomas Wouters7e474022000-07-16 12:04:32 +00006753 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006754 i = 0; /* chars up to and including most recent \n or \r */
6755 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6756 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 for (p = self->str; p < e; p++)
6758 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006759 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006760 incr = tabsize - (j % tabsize); /* cannot overflow */
6761 if (j > PY_SSIZE_T_MAX - incr)
6762 goto overflow1;
6763 j += incr;
6764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
6766 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006767 if (j > PY_SSIZE_T_MAX - 1)
6768 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 j++;
6770 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006771 if (i > PY_SSIZE_T_MAX - j)
6772 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006774 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
6776 }
6777
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006778 if (i > PY_SSIZE_T_MAX - j)
6779 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 /* Second pass: create output string and fill it */
6782 u = _PyUnicode_New(i + j);
6783 if (!u)
6784 return NULL;
6785
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006786 j = 0; /* same as in first pass */
6787 q = u->str; /* next output char */
6788 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
6790 for (p = self->str; p < e; p++)
6791 if (*p == '\t') {
6792 if (tabsize > 0) {
6793 i = tabsize - (j % tabsize);
6794 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006795 while (i--) {
6796 if (q >= qe)
6797 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 }
6801 }
6802 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006803 if (q >= qe)
6804 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006806 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 if (*p == '\n' || *p == '\r')
6808 j = 0;
6809 }
6810
6811 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006812
6813 overflow2:
6814 Py_DECREF(u);
6815 overflow1:
6816 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006821"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822\n\
6823Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006824such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825arguments start and end are interpreted as in slice notation.\n\
6826\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
6829static PyObject *
6830unicode_find(PyUnicodeObject *self, PyObject *args)
6831{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006832 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006833 Py_ssize_t start;
6834 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006835 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
Christian Heimes9cd17752007-11-18 19:35:23 +00006837 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
Thomas Wouters477c8d52006-05-27 19:21:47 +00006840 result = stringlib_find_slice(
6841 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6842 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6843 start, end
6844 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
6846 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006847
Christian Heimes217cfd12007-12-02 14:31:20 +00006848 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
6851static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006852unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
6854 if (index < 0 || index >= self->length) {
6855 PyErr_SetString(PyExc_IndexError, "string index out of range");
6856 return NULL;
6857 }
6858
6859 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6860}
6861
Guido van Rossumc2504932007-09-18 19:42:40 +00006862/* Believe it or not, this produces the same value for ASCII strings
6863 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006865unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
Guido van Rossumc2504932007-09-18 19:42:40 +00006867 Py_ssize_t len;
6868 Py_UNICODE *p;
6869 long x;
6870
6871 if (self->hash != -1)
6872 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006873 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006874 p = self->str;
6875 x = *p << 7;
6876 while (--len >= 0)
6877 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006878 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006879 if (x == -1)
6880 x = -2;
6881 self->hash = x;
6882 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883}
6884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006886"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889
6890static PyObject *
6891unicode_index(PyUnicodeObject *self, PyObject *args)
6892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006893 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006894 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006895 Py_ssize_t start;
6896 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
Christian Heimes9cd17752007-11-18 19:35:23 +00006898 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
Thomas Wouters477c8d52006-05-27 19:21:47 +00006901 result = stringlib_find_slice(
6902 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6903 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6904 start, end
6905 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906
6907 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 if (result < 0) {
6910 PyErr_SetString(PyExc_ValueError, "substring not found");
6911 return NULL;
6912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006913
Christian Heimes217cfd12007-12-02 14:31:20 +00006914 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915}
6916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
6923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006924unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927 register const Py_UNICODE *e;
6928 int cased;
6929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 /* Shortcut for single character strings */
6931 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006935 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 e = p + PyUnicode_GET_SIZE(self);
6939 cased = 0;
6940 for (; p < e; p++) {
6941 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 else if (!cased && Py_UNICODE_ISLOWER(ch))
6946 cased = 1;
6947 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006954Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
6957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006958unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959{
6960 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6961 register const Py_UNICODE *e;
6962 int cased;
6963
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 /* Shortcut for single character strings */
6965 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006966 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006968 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006969 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006971
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 e = p + PyUnicode_GET_SIZE(self);
6973 cased = 0;
6974 for (; p < e; p++) {
6975 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006976
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 else if (!cased && Py_UNICODE_ISUPPER(ch))
6980 cased = 1;
6981 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006988Return True if S is a titlecased string and there is at least one\n\
6989character in S, i.e. upper- and titlecase characters may only\n\
6990follow uncased characters and lowercase characters only cased ones.\n\
6991Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992
6993static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006994unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995{
6996 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6997 register const Py_UNICODE *e;
6998 int cased, previous_is_cased;
6999
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 /* Shortcut for single character strings */
7001 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007002 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7003 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007005 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007006 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007008
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 e = p + PyUnicode_GET_SIZE(self);
7010 cased = 0;
7011 previous_is_cased = 0;
7012 for (; p < e; p++) {
7013 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007014
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7016 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007017 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 previous_is_cased = 1;
7019 cased = 1;
7020 }
7021 else if (Py_UNICODE_ISLOWER(ch)) {
7022 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007023 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 previous_is_cased = 1;
7025 cased = 1;
7026 }
7027 else
7028 previous_is_cased = 0;
7029 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007030 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007033PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007034"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007036Return True if all characters in S are whitespace\n\
7037and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038
7039static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
7042 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7043 register const Py_UNICODE *e;
7044
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 /* Shortcut for single character strings */
7046 if (PyUnicode_GET_SIZE(self) == 1 &&
7047 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007048 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007050 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007051 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007052 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007053
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 e = p + PyUnicode_GET_SIZE(self);
7055 for (; p < e; p++) {
7056 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007057 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007059 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060}
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007063"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007064\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007065Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007067
7068static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007069unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007070{
7071 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7072 register const Py_UNICODE *e;
7073
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007074 /* Shortcut for single character strings */
7075 if (PyUnicode_GET_SIZE(self) == 1 &&
7076 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007077 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007078
7079 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007080 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007081 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007082
7083 e = p + PyUnicode_GET_SIZE(self);
7084 for (; p < e; p++) {
7085 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007086 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007087 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007088 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089}
7090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007092"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007093\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007094Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007095and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007096
7097static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007098unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007099{
7100 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7101 register const Py_UNICODE *e;
7102
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007103 /* Shortcut for single character strings */
7104 if (PyUnicode_GET_SIZE(self) == 1 &&
7105 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007106 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007107
7108 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007109 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007110 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007111
7112 e = p + PyUnicode_GET_SIZE(self);
7113 for (; p < e; p++) {
7114 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007115 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007116 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007117 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007118}
7119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007121"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007123Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007124False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125
7126static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007127unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128{
7129 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7130 register const Py_UNICODE *e;
7131
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 /* Shortcut for single character strings */
7133 if (PyUnicode_GET_SIZE(self) == 1 &&
7134 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007135 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007137 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007138 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007140
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 e = p + PyUnicode_GET_SIZE(self);
7142 for (; p < e; p++) {
7143 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007144 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007146 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147}
7148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007149PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007150"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007152Return True if all characters in S are digits\n\
7153and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
7155static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007156unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157{
7158 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7159 register const Py_UNICODE *e;
7160
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 /* Shortcut for single character strings */
7162 if (PyUnicode_GET_SIZE(self) == 1 &&
7163 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007164 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007166 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007167 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007168 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007169
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 e = p + PyUnicode_GET_SIZE(self);
7171 for (; p < e; p++) {
7172 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007173 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007175 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176}
7177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007181Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
7184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007185unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186{
7187 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7188 register const Py_UNICODE *e;
7189
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 /* Shortcut for single character strings */
7191 if (PyUnicode_GET_SIZE(self) == 1 &&
7192 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007193 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007195 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007196 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007197 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007198
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 e = p + PyUnicode_GET_SIZE(self);
7200 for (; p < e; p++) {
7201 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007202 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205}
7206
Martin v. Löwis47383402007-08-15 07:32:56 +00007207int
7208PyUnicode_IsIdentifier(PyObject *self)
7209{
7210 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7211 register const Py_UNICODE *e;
7212
7213 /* Special case for empty strings */
7214 if (PyUnicode_GET_SIZE(self) == 0)
7215 return 0;
7216
7217 /* PEP 3131 says that the first character must be in
7218 XID_Start and subsequent characters in XID_Continue,
7219 and for the ASCII range, the 2.x rules apply (i.e
7220 start with letters and underscore, continue with
7221 letters, digits, underscore). However, given the current
7222 definition of XID_Start and XID_Continue, it is sufficient
7223 to check just for these, except that _ must be allowed
7224 as starting an identifier. */
7225 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7226 return 0;
7227
7228 e = p + PyUnicode_GET_SIZE(self);
7229 for (p++; p < e; p++) {
7230 if (!_PyUnicode_IsXidContinue(*p))
7231 return 0;
7232 }
7233 return 1;
7234}
7235
7236PyDoc_STRVAR(isidentifier__doc__,
7237"S.isidentifier() -> bool\n\
7238\n\
7239Return True if S is a valid identifier according\n\
7240to the language definition.");
7241
7242static PyObject*
7243unicode_isidentifier(PyObject *self)
7244{
7245 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7246}
7247
Georg Brandl559e5d72008-06-11 18:37:52 +00007248PyDoc_STRVAR(isprintable__doc__,
7249"S.isprintable() -> bool\n\
7250\n\
7251Return True if all characters in S are considered\n\
7252printable in repr() or S is empty, False otherwise.");
7253
7254static PyObject*
7255unicode_isprintable(PyObject *self)
7256{
7257 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7258 register const Py_UNICODE *e;
7259
7260 /* Shortcut for single character strings */
7261 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7262 Py_RETURN_TRUE;
7263 }
7264
7265 e = p + PyUnicode_GET_SIZE(self);
7266 for (; p < e; p++) {
7267 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7268 Py_RETURN_FALSE;
7269 }
7270 }
7271 Py_RETURN_TRUE;
7272}
7273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007274PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007275"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276\n\
7277Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007281unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007283 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284}
7285
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287unicode_length(PyUnicodeObject *self)
7288{
7289 return self->length;
7290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007293"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
7295Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007296done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject *
7299unicode_ljust(PyUnicodeObject *self, PyObject *args)
7300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007301 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007302 Py_UNICODE fillchar = ' ';
7303
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007304 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 return NULL;
7306
Tim Peters7a29bd52001-09-12 03:03:31 +00007307 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 Py_INCREF(self);
7309 return (PyObject*) self;
7310 }
7311
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007312 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313}
7314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007315PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007316"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
7320static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007321unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 return fixup(self, fixlower);
7324}
7325
/* Selector values for the strip helpers; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per selector value above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[(i)][3])
7334
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007335/* externally visible for str.strip(unicode) */
7336PyObject *
7337_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7338{
7339 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007341 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007342 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7343 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007344
Thomas Wouters477c8d52006-05-27 19:21:47 +00007345 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7346
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007347 i = 0;
7348 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007349 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7350 i++;
7351 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007352 }
7353
7354 j = len;
7355 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007356 do {
7357 j--;
7358 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7359 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007360 }
7361
7362 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007363 Py_INCREF(self);
7364 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007365 }
7366 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007367 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007368}
7369
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
7371static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007372do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007374 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007375 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007376
7377 i = 0;
7378 if (striptype != RIGHTSTRIP) {
7379 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7380 i++;
7381 }
7382 }
7383
7384 j = len;
7385 if (striptype != LEFTSTRIP) {
7386 do {
7387 j--;
7388 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7389 j++;
7390 }
7391
7392 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7393 Py_INCREF(self);
7394 return (PyObject*)self;
7395 }
7396 else
7397 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398}
7399
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007400
7401static PyObject *
7402do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7403{
7404 PyObject *sep = NULL;
7405
7406 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7407 return NULL;
7408
7409 if (sep != NULL && sep != Py_None) {
7410 if (PyUnicode_Check(sep))
7411 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007412 else {
7413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007414 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007415 STRIPNAME(striptype));
7416 return NULL;
7417 }
7418 }
7419
7420 return do_strip(self, striptype);
7421}
7422
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007425"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007426\n\
7427Return a copy of the string S with leading and trailing\n\
7428whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007429If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007430
7431static PyObject *
7432unicode_strip(PyUnicodeObject *self, PyObject *args)
7433{
7434 if (PyTuple_GET_SIZE(args) == 0)
7435 return do_strip(self, BOTHSTRIP); /* Common case */
7436 else
7437 return do_argstrip(self, BOTHSTRIP, args);
7438}
7439
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007442"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007443\n\
7444Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007445If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007446
7447static PyObject *
7448unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7449{
7450 if (PyTuple_GET_SIZE(args) == 0)
7451 return do_strip(self, LEFTSTRIP); /* Common case */
7452 else
7453 return do_argstrip(self, LEFTSTRIP, args);
7454}
7455
7456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007457PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007458"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007459\n\
7460Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007461If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007462
7463static PyObject *
7464unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7465{
7466 if (PyTuple_GET_SIZE(args) == 0)
7467 return do_strip(self, RIGHTSTRIP); /* Common case */
7468 else
7469 return do_argstrip(self, RIGHTSTRIP, args);
7470}
7471
7472
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007474unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475{
7476 PyUnicodeObject *u;
7477 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007479 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
7481 if (len < 0)
7482 len = 0;
7483
Tim Peters7a29bd52001-09-12 03:03:31 +00007484 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 /* no repeat, return original string */
7486 Py_INCREF(str);
7487 return (PyObject*) str;
7488 }
Tim Peters8f422462000-09-09 06:13:41 +00007489
7490 /* ensure # of chars needed doesn't overflow int and # of bytes
7491 * needed doesn't overflow size_t
7492 */
7493 nchars = len * str->length;
7494 if (len && nchars / len != str->length) {
7495 PyErr_SetString(PyExc_OverflowError,
7496 "repeated string is too long");
7497 return NULL;
7498 }
7499 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7500 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7501 PyErr_SetString(PyExc_OverflowError,
7502 "repeated string is too long");
7503 return NULL;
7504 }
7505 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 if (!u)
7507 return NULL;
7508
7509 p = u->str;
7510
Thomas Wouters477c8d52006-05-27 19:21:47 +00007511 if (str->length == 1 && len > 0) {
7512 Py_UNICODE_FILL(p, str->str[0], len);
7513 } else {
7514 Py_ssize_t done = 0; /* number of characters copied this far */
7515 if (done < nchars) {
7516 Py_UNICODE_COPY(p, str->str, str->length);
7517 done = str->length;
7518 }
7519 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007520 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007521 Py_UNICODE_COPY(p+done, p, n);
7522 done += n;
7523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 }
7525
7526 return (PyObject*) u;
7527}
7528
7529PyObject *PyUnicode_Replace(PyObject *obj,
7530 PyObject *subobj,
7531 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007532 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533{
7534 PyObject *self;
7535 PyObject *str1;
7536 PyObject *str2;
7537 PyObject *result;
7538
7539 self = PyUnicode_FromObject(obj);
7540 if (self == NULL)
7541 return NULL;
7542 str1 = PyUnicode_FromObject(subobj);
7543 if (str1 == NULL) {
7544 Py_DECREF(self);
7545 return NULL;
7546 }
7547 str2 = PyUnicode_FromObject(replobj);
7548 if (str2 == NULL) {
7549 Py_DECREF(self);
7550 Py_DECREF(str1);
7551 return NULL;
7552 }
Tim Petersced69f82003-09-16 20:30:58 +00007553 result = replace((PyUnicodeObject *)self,
7554 (PyUnicodeObject *)str1,
7555 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 maxcount);
7557 Py_DECREF(self);
7558 Py_DECREF(str1);
7559 Py_DECREF(str2);
7560 return result;
7561}
7562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007564"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565\n\
7566Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007567old replaced by new. If the optional argument count is\n\
7568given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
7570static PyObject*
7571unicode_replace(PyUnicodeObject *self, PyObject *args)
7572{
7573 PyUnicodeObject *str1;
7574 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 PyObject *result;
7577
Martin v. Löwis18e16552006-02-15 17:27:45 +00007578 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 return NULL;
7580 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7581 if (str1 == NULL)
7582 return NULL;
7583 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007584 if (str2 == NULL) {
7585 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589 result = replace(self, str1, str2, maxcount);
7590
7591 Py_DECREF(str1);
7592 Py_DECREF(str2);
7593 return result;
7594}
7595
7596static
7597PyObject *unicode_repr(PyObject *unicode)
7598{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007599 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007600 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007601 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7602 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7603
7604 /* XXX(nnorwitz): rather than over-allocating, it would be
7605 better to choose a different scheme. Perhaps scan the
7606 first N-chars of the string and allocate based on that size.
7607 */
7608 /* Initial allocation is based on the longest-possible unichr
7609 escape.
7610
7611 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7612 unichr, so in this case it's the longest unichr escape. In
7613 narrow (UTF-16) builds this is five chars per source unichr
7614 since there are two unichrs in the surrogate pair, so in narrow
7615 (UTF-16) builds it's not the longest unichr escape.
7616
7617 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7618 so in the narrow (UTF-16) build case it's the longest unichr
7619 escape.
7620 */
7621
Walter Dörwald1ab83302007-05-18 17:15:44 +00007622 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007623 2 /* quotes */
7624#ifdef Py_UNICODE_WIDE
7625 + 10*size
7626#else
7627 + 6*size
7628#endif
7629 + 1);
7630 if (repr == NULL)
7631 return NULL;
7632
Walter Dörwald1ab83302007-05-18 17:15:44 +00007633 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007634
7635 /* Add quote */
7636 *p++ = (findchar(s, size, '\'') &&
7637 !findchar(s, size, '"')) ? '"' : '\'';
7638 while (size-- > 0) {
7639 Py_UNICODE ch = *s++;
7640
7641 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007642 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007643 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007644 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007645 continue;
7646 }
7647
Georg Brandl559e5d72008-06-11 18:37:52 +00007648 /* Map special whitespace to '\t', \n', '\r' */
7649 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007650 *p++ = '\\';
7651 *p++ = 't';
7652 }
7653 else if (ch == '\n') {
7654 *p++ = '\\';
7655 *p++ = 'n';
7656 }
7657 else if (ch == '\r') {
7658 *p++ = '\\';
7659 *p++ = 'r';
7660 }
7661
7662 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007663 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007664 *p++ = '\\';
7665 *p++ = 'x';
7666 *p++ = hexdigits[(ch >> 4) & 0x000F];
7667 *p++ = hexdigits[ch & 0x000F];
7668 }
7669
Georg Brandl559e5d72008-06-11 18:37:52 +00007670 /* Copy ASCII characters as-is */
7671 else if (ch < 0x7F) {
7672 *p++ = ch;
7673 }
7674
7675 /* Non-ASCII characters */
7676 else {
7677 Py_UCS4 ucs = ch;
7678
7679#ifndef Py_UNICODE_WIDE
7680 Py_UNICODE ch2 = 0;
7681 /* Get code point from surrogate pair */
7682 if (size > 0) {
7683 ch2 = *s;
7684 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7685 && ch2 <= 0xDFFF) {
7686 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7687 + 0x00010000;
7688 s++;
7689 size--;
7690 }
7691 }
7692#endif
7693 /* Map Unicode whitespace and control characters
7694 (categories Z* and C* except ASCII space)
7695 */
7696 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7697 /* Map 8-bit characters to '\xhh' */
7698 if (ucs <= 0xff) {
7699 *p++ = '\\';
7700 *p++ = 'x';
7701 *p++ = hexdigits[(ch >> 4) & 0x000F];
7702 *p++ = hexdigits[ch & 0x000F];
7703 }
7704 /* Map 21-bit characters to '\U00xxxxxx' */
7705 else if (ucs >= 0x10000) {
7706 *p++ = '\\';
7707 *p++ = 'U';
7708 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7709 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7710 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7711 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7712 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7713 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7714 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7715 *p++ = hexdigits[ucs & 0x0000000F];
7716 }
7717 /* Map 16-bit characters to '\uxxxx' */
7718 else {
7719 *p++ = '\\';
7720 *p++ = 'u';
7721 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7722 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7723 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7724 *p++ = hexdigits[ucs & 0x000F];
7725 }
7726 }
7727 /* Copy characters as-is */
7728 else {
7729 *p++ = ch;
7730#ifndef Py_UNICODE_WIDE
7731 if (ucs >= 0x10000)
7732 *p++ = ch2;
7733#endif
7734 }
7735 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007736 }
7737 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007738 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007739
7740 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007741 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007742 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743}
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007746"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747\n\
7748Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007749such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750arguments start and end are interpreted as in slice notation.\n\
7751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007752Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
7754static PyObject *
7755unicode_rfind(PyUnicodeObject *self, PyObject *args)
7756{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007757 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007758 Py_ssize_t start;
7759 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007760 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761
Christian Heimes9cd17752007-11-18 19:35:23 +00007762 if (!_ParseTupleFinds(args, &substring, &start, &end))
7763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
Thomas Wouters477c8d52006-05-27 19:21:47 +00007765 result = stringlib_rfind_slice(
7766 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7767 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7768 start, end
7769 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
7771 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007772
Christian Heimes217cfd12007-12-02 14:31:20 +00007773 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774}
7775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007776PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007777"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007779Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780
7781static PyObject *
7782unicode_rindex(PyUnicodeObject *self, PyObject *args)
7783{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007784 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007785 Py_ssize_t start;
7786 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007787 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
Christian Heimes9cd17752007-11-18 19:35:23 +00007789 if (!_ParseTupleFinds(args, &substring, &start, &end))
7790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
Thomas Wouters477c8d52006-05-27 19:21:47 +00007792 result = stringlib_rfind_slice(
7793 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7794 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7795 start, end
7796 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
7798 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007799
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 if (result < 0) {
7801 PyErr_SetString(PyExc_ValueError, "substring not found");
7802 return NULL;
7803 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007804 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805}
7806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007807PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007808"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007810Return S right justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007811done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
7813static PyObject *
7814unicode_rjust(PyUnicodeObject *self, PyObject *args)
7815{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007816 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007817 Py_UNICODE fillchar = ' ';
7818
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007819 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 return NULL;
7821
Tim Peters7a29bd52001-09-12 03:03:31 +00007822 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 Py_INCREF(self);
7824 return (PyObject*) self;
7825 }
7826
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007827 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828}
7829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830PyObject *PyUnicode_Split(PyObject *s,
7831 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007832 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833{
7834 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007835
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 s = PyUnicode_FromObject(s);
7837 if (s == NULL)
7838 return NULL;
7839 if (sep != NULL) {
7840 sep = PyUnicode_FromObject(sep);
7841 if (sep == NULL) {
7842 Py_DECREF(s);
7843 return NULL;
7844 }
7845 }
7846
7847 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7848
7849 Py_DECREF(s);
7850 Py_XDECREF(sep);
7851 return result;
7852}
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007855"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856\n\
7857Return a list of the words in S, using sep as the\n\
7858delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007859splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007860whitespace string is a separator and empty strings are\n\
7861removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
7863static PyObject*
7864unicode_split(PyUnicodeObject *self, PyObject *args)
7865{
7866 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 return NULL;
7871
7872 if (substring == Py_None)
7873 return split(self, NULL, maxcount);
7874 else if (PyUnicode_Check(substring))
7875 return split(self, (PyUnicodeObject *)substring, maxcount);
7876 else
7877 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7878}
7879
Thomas Wouters477c8d52006-05-27 19:21:47 +00007880PyObject *
7881PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7882{
7883 PyObject* str_obj;
7884 PyObject* sep_obj;
7885 PyObject* out;
7886
7887 str_obj = PyUnicode_FromObject(str_in);
7888 if (!str_obj)
7889 return NULL;
7890 sep_obj = PyUnicode_FromObject(sep_in);
7891 if (!sep_obj) {
7892 Py_DECREF(str_obj);
7893 return NULL;
7894 }
7895
7896 out = stringlib_partition(
7897 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7898 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7899 );
7900
7901 Py_DECREF(sep_obj);
7902 Py_DECREF(str_obj);
7903
7904 return out;
7905}
7906
7907
7908PyObject *
7909PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7910{
7911 PyObject* str_obj;
7912 PyObject* sep_obj;
7913 PyObject* out;
7914
7915 str_obj = PyUnicode_FromObject(str_in);
7916 if (!str_obj)
7917 return NULL;
7918 sep_obj = PyUnicode_FromObject(sep_in);
7919 if (!sep_obj) {
7920 Py_DECREF(str_obj);
7921 return NULL;
7922 }
7923
7924 out = stringlib_rpartition(
7925 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7926 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7927 );
7928
7929 Py_DECREF(sep_obj);
7930 Py_DECREF(str_obj);
7931
7932 return out;
7933}
7934
7935PyDoc_STRVAR(partition__doc__,
7936"S.partition(sep) -> (head, sep, tail)\n\
7937\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007938Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007939the separator itself, and the part after it. If the separator is not\n\
7940found, returns S and two empty strings.");
7941
7942static PyObject*
7943unicode_partition(PyUnicodeObject *self, PyObject *separator)
7944{
7945 return PyUnicode_Partition((PyObject *)self, separator);
7946}
7947
7948PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007949"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007950\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007951Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007953separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954
7955static PyObject*
7956unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7957{
7958 return PyUnicode_RPartition((PyObject *)self, separator);
7959}
7960
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007961PyObject *PyUnicode_RSplit(PyObject *s,
7962 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007963 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007964{
7965 PyObject *result;
7966
7967 s = PyUnicode_FromObject(s);
7968 if (s == NULL)
7969 return NULL;
7970 if (sep != NULL) {
7971 sep = PyUnicode_FromObject(sep);
7972 if (sep == NULL) {
7973 Py_DECREF(s);
7974 return NULL;
7975 }
7976 }
7977
7978 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7979
7980 Py_DECREF(s);
7981 Py_XDECREF(sep);
7982 return result;
7983}
7984
7985PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007986"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007987\n\
7988Return a list of the words in S, using sep as the\n\
7989delimiter string, starting at the end of the string and\n\
7990working to the front. If maxsplit is given, at most maxsplit\n\
7991splits are done. If sep is not specified, any whitespace string\n\
7992is a separator.");
7993
7994static PyObject*
7995unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7996{
7997 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007998 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007999
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008001 return NULL;
8002
8003 if (substring == Py_None)
8004 return rsplit(self, NULL, maxcount);
8005 else if (PyUnicode_Check(substring))
8006 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8007 else
8008 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8009}
8010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008011PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00008012"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013\n\
8014Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008015Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008016is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017
8018static PyObject*
8019unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8020{
Guido van Rossum86662912000-04-11 15:38:46 +00008021 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022
Guido van Rossum86662912000-04-11 15:38:46 +00008023 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return NULL;
8025
Guido van Rossum86662912000-04-11 15:38:46 +00008026 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027}
8028
8029static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008030PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031{
Walter Dörwald346737f2007-05-31 10:44:43 +00008032 if (PyUnicode_CheckExact(self)) {
8033 Py_INCREF(self);
8034 return self;
8035 } else
8036 /* Subtype -- return genuine unicode string with the same value. */
8037 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8038 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039}
8040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008041PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008042"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043\n\
8044Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008045and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
8047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008048unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return fixup(self, fixswapcase);
8051}
8052
Georg Brandlceee0772007-11-27 23:48:05 +00008053PyDoc_STRVAR(maketrans__doc__,
8054"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8055\n\
8056Return a translation table usable for str.translate().\n\
8057If there is only one argument, it must be a dictionary mapping Unicode\n\
8058ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008059Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008060If there are two arguments, they must be strings of equal length, and\n\
8061in the resulting dictionary, each character in x will be mapped to the\n\
8062character at the same position in y. If there is a third argument, it\n\
8063must be a string, whose characters will be mapped to None in the result.");
8064
8065static PyObject*
8066unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8067{
8068 PyObject *x, *y = NULL, *z = NULL;
8069 PyObject *new = NULL, *key, *value;
8070 Py_ssize_t i = 0;
8071 int res;
8072
8073 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8074 return NULL;
8075 new = PyDict_New();
8076 if (!new)
8077 return NULL;
8078 if (y != NULL) {
8079 /* x must be a string too, of equal length */
8080 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8081 if (!PyUnicode_Check(x)) {
8082 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8083 "be a string if there is a second argument");
8084 goto err;
8085 }
8086 if (PyUnicode_GET_SIZE(x) != ylen) {
8087 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8088 "arguments must have equal length");
8089 goto err;
8090 }
8091 /* create entries for translating chars in x to those in y */
8092 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008093 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8094 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008095 if (!key || !value)
8096 goto err;
8097 res = PyDict_SetItem(new, key, value);
8098 Py_DECREF(key);
8099 Py_DECREF(value);
8100 if (res < 0)
8101 goto err;
8102 }
8103 /* create entries for deleting chars in z */
8104 if (z != NULL) {
8105 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008106 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008107 if (!key)
8108 goto err;
8109 res = PyDict_SetItem(new, key, Py_None);
8110 Py_DECREF(key);
8111 if (res < 0)
8112 goto err;
8113 }
8114 }
8115 } else {
8116 /* x must be a dict */
8117 if (!PyDict_Check(x)) {
8118 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8119 "to maketrans it must be a dict");
8120 goto err;
8121 }
8122 /* copy entries into the new dict, converting string keys to int keys */
8123 while (PyDict_Next(x, &i, &key, &value)) {
8124 if (PyUnicode_Check(key)) {
8125 /* convert string keys to integer keys */
8126 PyObject *newkey;
8127 if (PyUnicode_GET_SIZE(key) != 1) {
8128 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8129 "table must be of length 1");
8130 goto err;
8131 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008132 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008133 if (!newkey)
8134 goto err;
8135 res = PyDict_SetItem(new, newkey, value);
8136 Py_DECREF(newkey);
8137 if (res < 0)
8138 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008139 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008140 /* just keep integer keys */
8141 if (PyDict_SetItem(new, key, value) < 0)
8142 goto err;
8143 } else {
8144 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8145 "be strings or integers");
8146 goto err;
8147 }
8148 }
8149 }
8150 return new;
8151 err:
8152 Py_DECREF(new);
8153 return NULL;
8154}
8155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008156PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008157"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158\n\
8159Return a copy of the string S, where all characters have been mapped\n\
8160through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008161Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008162Unmapped characters are left untouched. Characters mapped to None\n\
8163are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
8165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008166unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167{
Georg Brandlceee0772007-11-27 23:48:05 +00008168 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169}
8170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008171PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008172"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175
8176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008177unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 return fixup(self, fixupper);
8180}
8181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008182PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008183"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184\n\
8185Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008186of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187
8188static PyObject *
8189unicode_zfill(PyUnicodeObject *self, PyObject *args)
8190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008191 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 PyUnicodeObject *u;
8193
Martin v. Löwis18e16552006-02-15 17:27:45 +00008194 Py_ssize_t width;
8195 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 return NULL;
8197
8198 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008199 if (PyUnicode_CheckExact(self)) {
8200 Py_INCREF(self);
8201 return (PyObject*) self;
8202 }
8203 else
8204 return PyUnicode_FromUnicode(
8205 PyUnicode_AS_UNICODE(self),
8206 PyUnicode_GET_SIZE(self)
8207 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 }
8209
8210 fill = width - self->length;
8211
8212 u = pad(self, fill, 0, '0');
8213
Walter Dörwald068325e2002-04-15 13:36:47 +00008214 if (u == NULL)
8215 return NULL;
8216
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 if (u->str[fill] == '+' || u->str[fill] == '-') {
8218 /* move sign to beginning of string */
8219 u->str[0] = u->str[fill];
8220 u->str[fill] = '0';
8221 }
8222
8223 return (PyObject*) u;
8224}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
#if 0
/* Disabled helper: reports the value of the file-level `numfree` counter
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008234PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008235"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008237Return True if S starts with the specified prefix, False otherwise.\n\
8238With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008239With optional end, stop comparing S at that position.\n\
8240prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241
8242static PyObject *
8243unicode_startswith(PyUnicodeObject *self,
8244 PyObject *args)
8245{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008246 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008248 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008249 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008250 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008252 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008253 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008255 if (PyTuple_Check(subobj)) {
8256 Py_ssize_t i;
8257 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8258 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8259 PyTuple_GET_ITEM(subobj, i));
8260 if (substring == NULL)
8261 return NULL;
8262 result = tailmatch(self, substring, start, end, -1);
8263 Py_DECREF(substring);
8264 if (result) {
8265 Py_RETURN_TRUE;
8266 }
8267 }
8268 /* nothing matched */
8269 Py_RETURN_FALSE;
8270 }
8271 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008273 return NULL;
8274 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008276 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277}
8278
8279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008280PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008281"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008283Return True if S ends with the specified suffix, False otherwise.\n\
8284With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008285With optional end, stop comparing S at that position.\n\
8286suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287
8288static PyObject *
8289unicode_endswith(PyUnicodeObject *self,
8290 PyObject *args)
8291{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008292 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008294 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008295 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008296 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008298 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8299 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008301 if (PyTuple_Check(subobj)) {
8302 Py_ssize_t i;
8303 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8304 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8305 PyTuple_GET_ITEM(subobj, i));
8306 if (substring == NULL)
8307 return NULL;
8308 result = tailmatch(self, substring, start, end, +1);
8309 Py_DECREF(substring);
8310 if (result) {
8311 Py_RETURN_TRUE;
8312 }
8313 }
8314 Py_RETURN_FALSE;
8315 }
8316 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008320 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008322 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323}
8324
Eric Smith8c663262007-08-25 02:26:07 +00008325#include "stringlib/string_format.h"
8326
8327PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008328"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008329\n\
8330");
8331
Eric Smith4a7d76d2008-05-30 18:10:19 +00008332static PyObject *
8333unicode__format__(PyObject* self, PyObject* args)
8334{
8335 PyObject *format_spec;
8336
8337 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8338 return NULL;
8339
8340 return _PyUnicode_FormatAdvanced(self,
8341 PyUnicode_AS_UNICODE(format_spec),
8342 PyUnicode_GET_SIZE(format_spec));
8343}
8344
Eric Smith8c663262007-08-25 02:26:07 +00008345PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008346"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008347\n\
8348");
8349
8350static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008351unicode__sizeof__(PyUnicodeObject *v)
8352{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008353 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8354 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008355}
8356
8357PyDoc_STRVAR(sizeof__doc__,
8358"S.__sizeof__() -> size of S in memory, in bytes");
8359
8360static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008361unicode_getnewargs(PyUnicodeObject *v)
8362{
8363 return Py_BuildValue("(u#)", v->str, v->length);
8364}
8365
8366
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367static PyMethodDef unicode_methods[] = {
8368
8369 /* Order is according to common usage: often used methods should
8370 appear first, since lookup is done sequentially. */
8371
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008372 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8373 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8374 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008375 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008376 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8377 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8378 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8379 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8380 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8381 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8382 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008384 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8385 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8386 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008387 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008388 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8389 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8390 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008391 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008393 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008394 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008395 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8396 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8397 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8398 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8399 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8400 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8401 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8402 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8403 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8404 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8405 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8406 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8407 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8408 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008409 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008410 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008411 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008412 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008413 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008414 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8415 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008416 {"maketrans", (PyCFunction) unicode_maketrans,
8417 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008418 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008419#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008420 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421#endif
8422
8423#if 0
8424 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008425 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426#endif
8427
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008428 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 {NULL, NULL}
8430};
8431
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008432static PyObject *
8433unicode_mod(PyObject *v, PyObject *w)
8434{
8435 if (!PyUnicode_Check(v)) {
8436 Py_INCREF(Py_NotImplemented);
8437 return Py_NotImplemented;
8438 }
8439 return PyUnicode_Format(v, w);
8440}
8441
8442static PyNumberMethods unicode_as_number = {
8443 0, /*nb_add*/
8444 0, /*nb_subtract*/
8445 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008446 unicode_mod, /*nb_remainder*/
8447};
8448
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008451 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8453 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008454 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 0, /* sq_ass_item */
8456 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008457 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458};
8459
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008460static PyObject*
8461unicode_subscript(PyUnicodeObject* self, PyObject* item)
8462{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008463 if (PyIndex_Check(item)) {
8464 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008465 if (i == -1 && PyErr_Occurred())
8466 return NULL;
8467 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008468 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008469 return unicode_getitem(self, i);
8470 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008471 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008472 Py_UNICODE* source_buf;
8473 Py_UNICODE* result_buf;
8474 PyObject* result;
8475
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008476 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008477 &start, &stop, &step, &slicelength) < 0) {
8478 return NULL;
8479 }
8480
8481 if (slicelength <= 0) {
8482 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008483 } else if (start == 0 && step == 1 && slicelength == self->length &&
8484 PyUnicode_CheckExact(self)) {
8485 Py_INCREF(self);
8486 return (PyObject *)self;
8487 } else if (step == 1) {
8488 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008489 } else {
8490 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008491 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8492 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008493
8494 if (result_buf == NULL)
8495 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008496
8497 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8498 result_buf[i] = source_buf[cur];
8499 }
Tim Petersced69f82003-09-16 20:30:58 +00008500
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008501 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008502 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008503 return result;
8504 }
8505 } else {
8506 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8507 return NULL;
8508 }
8509}
8510
8511static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008512 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008513 (binaryfunc)unicode_subscript, /* mp_subscript */
8514 (objobjargproc)0, /* mp_ass_subscript */
8515};
8516
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518/* Helpers for PyUnicode_Format() */
8519
8520static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008521getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 if (argidx < arglen) {
8525 (*p_argidx)++;
8526 if (arglen < 0)
8527 return args;
8528 else
8529 return PyTuple_GetItem(args, argidx);
8530 }
8531 PyErr_SetString(PyExc_TypeError,
8532 "not enough arguments for format string");
8533 return NULL;
8534}
8535
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008537strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 register Py_ssize_t i;
8540 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 for (i = len - 1; i >= 0; i--)
8542 buffer[i] = (Py_UNICODE) charbuffer[i];
8543
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 return len;
8545}
8546
Neal Norwitzfc76d632006-01-10 06:03:13 +00008547static int
8548doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8549{
Tim Peters15231542006-02-16 01:08:01 +00008550 Py_ssize_t result;
8551
Neal Norwitzfc76d632006-01-10 06:03:13 +00008552 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008553 result = strtounicode(buffer, (char *)buffer);
8554 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008555}
8556
#if 0
/* Compiled out.  Formats the long x with PyOS_snprintf() into buffer
   (used as char scratch space), widens the result in place to
   Py_UNICODE and returns the number of characters. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *scratch = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(scratch, len, format, x);
    nchars = strtounicode(buffer, scratch);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008568
Guido van Rossum078151d2002-08-11 04:24:12 +00008569/* XXX To save some code duplication, formatfloat/long/int could have been
8570 shared with stringobject.c, converting from 8-bit to Unicode after the
8571 formatting is done. */
8572
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573static int
8574formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008575 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 int flags,
8577 int prec,
8578 int type,
8579 PyObject *v)
8580{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008581 /* fmt = '%#.' + `prec` + `type`
8582 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 char fmt[20];
8584 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008585
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 x = PyFloat_AsDouble(v);
8587 if (x == -1.0 && PyErr_Occurred())
8588 return -1;
8589 if (prec < 0)
8590 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008591 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8592 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008593 /* Worst case length calc to ensure no buffer overrun:
8594
8595 'g' formats:
8596 fmt = %#.<prec>g
8597 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8598 for any double rep.)
8599 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8600
8601 'f' formats:
8602 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8603 len = 1 + 50 + 1 + prec = 52 + prec
8604
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008605 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008606 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008607
8608 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008609 if (((type == 'g' || type == 'G') &&
8610 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008611 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008612 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008613 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008614 return -1;
8615 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008616 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8617 (flags&F_ALT) ? "#" : "",
8618 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008619 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620}
8621
Tim Peters38fd5b62000-09-21 05:43:11 +00008622static PyObject*
8623formatlong(PyObject *val, int flags, int prec, int type)
8624{
8625 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008626 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008627 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008628 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008629
Christian Heimes72b710a2008-05-26 13:28:38 +00008630 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008631 if (!str)
8632 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008633 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008634 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008635 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008636}
8637
#if 0
/* Compiled out.  Format the Python integer v (which must fit in a C
   long) into buf according to flags, prec and the conversion character
   type.  Returns the number of characters written, or -1 with an
   exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                       + 1 + 1
     *                     = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u') {
        type = 'd';
    }
    /* Hex and octal conversions print the magnitude with an explicit
       leading minus sign. */
    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
        sign = "-";
    else
        sign = "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) &&
        (type == 'x' || type == 'X' || type == 'o')) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    if (sign[0])
        /* Negate in unsigned arithmetic: a plain -x is signed overflow
           (undefined behaviour) when x == LONG_MIN.  The %lx/%lX/%lo
           conversions read the value as unsigned, so the magnitude is
           printed correctly. */
        return longtounicode(buf, buflen, fmt,
                             (long)(0UL - (unsigned long)x));
    else
        return longtounicode(buf, buflen, fmt, x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715
8716static int
8717formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008718 size_t buflen,
8719 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008721 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008722 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008723 if (PyUnicode_GET_SIZE(v) == 1) {
8724 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8725 buf[1] = '\0';
8726 return 1;
8727 }
8728#ifndef Py_UNICODE_WIDE
8729 if (PyUnicode_GET_SIZE(v) == 2) {
8730 /* Decode a valid surrogate pair */
8731 int c0 = PyUnicode_AS_UNICODE(v)[0];
8732 int c1 = PyUnicode_AS_UNICODE(v)[1];
8733 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8734 0xDC00 <= c1 && c1 <= 0xDFFF) {
8735 buf[0] = c0;
8736 buf[1] = c1;
8737 buf[2] = '\0';
8738 return 2;
8739 }
8740 }
8741#endif
8742 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 else {
8745 /* Integer input truncated to a character */
8746 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008747 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008749 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008750
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008751 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008752 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008753 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008754 return -1;
8755 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008756
8757#ifndef Py_UNICODE_WIDE
8758 if (x > 0xffff) {
8759 x -= 0x10000;
8760 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8761 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8762 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008763 }
8764#endif
8765 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008766 buf[1] = '\0';
8767 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008769
8770 onError:
8771 PyErr_SetString(PyExc_TypeError,
8772 "%c requires int or char");
8773 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774}
8775
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after a unary operator such
   as sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8785
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786PyObject *PyUnicode_Format(PyObject *format,
8787 PyObject *args)
8788{
8789 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008790 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 int args_owned = 0;
8792 PyUnicodeObject *result = NULL;
8793 PyObject *dict = NULL;
8794 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008795
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 if (format == NULL || args == NULL) {
8797 PyErr_BadInternalCall();
8798 return NULL;
8799 }
8800 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008801 if (uformat == NULL)
8802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 fmt = PyUnicode_AS_UNICODE(uformat);
8804 fmtcnt = PyUnicode_GET_SIZE(uformat);
8805
8806 reslen = rescnt = fmtcnt + 100;
8807 result = _PyUnicode_New(reslen);
8808 if (result == NULL)
8809 goto onError;
8810 res = PyUnicode_AS_UNICODE(result);
8811
8812 if (PyTuple_Check(args)) {
8813 arglen = PyTuple_Size(args);
8814 argidx = 0;
8815 }
8816 else {
8817 arglen = -1;
8818 argidx = -2;
8819 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008820 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008821 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 dict = args;
8823
8824 while (--fmtcnt >= 0) {
8825 if (*fmt != '%') {
8826 if (--rescnt < 0) {
8827 rescnt = fmtcnt + 100;
8828 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008829 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8832 --rescnt;
8833 }
8834 *res++ = *fmt++;
8835 }
8836 else {
8837 /* Got a format specifier */
8838 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008839 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 Py_UNICODE c = '\0';
8842 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008843 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 PyObject *v = NULL;
8845 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008846 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008848 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008849 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850
8851 fmt++;
8852 if (*fmt == '(') {
8853 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008854 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 PyObject *key;
8856 int pcount = 1;
8857
8858 if (dict == NULL) {
8859 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008860 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 goto onError;
8862 }
8863 ++fmt;
8864 --fmtcnt;
8865 keystart = fmt;
8866 /* Skip over balanced parentheses */
8867 while (pcount > 0 && --fmtcnt >= 0) {
8868 if (*fmt == ')')
8869 --pcount;
8870 else if (*fmt == '(')
8871 ++pcount;
8872 fmt++;
8873 }
8874 keylen = fmt - keystart - 1;
8875 if (fmtcnt < 0 || pcount > 0) {
8876 PyErr_SetString(PyExc_ValueError,
8877 "incomplete format key");
8878 goto onError;
8879 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008880#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008881 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 then looked up since Python uses strings to hold
8883 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008884 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 key = PyUnicode_EncodeUTF8(keystart,
8886 keylen,
8887 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008888#else
8889 key = PyUnicode_FromUnicode(keystart, keylen);
8890#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 if (key == NULL)
8892 goto onError;
8893 if (args_owned) {
8894 Py_DECREF(args);
8895 args_owned = 0;
8896 }
8897 args = PyObject_GetItem(dict, key);
8898 Py_DECREF(key);
8899 if (args == NULL) {
8900 goto onError;
8901 }
8902 args_owned = 1;
8903 arglen = -1;
8904 argidx = -2;
8905 }
8906 while (--fmtcnt >= 0) {
8907 switch (c = *fmt++) {
8908 case '-': flags |= F_LJUST; continue;
8909 case '+': flags |= F_SIGN; continue;
8910 case ' ': flags |= F_BLANK; continue;
8911 case '#': flags |= F_ALT; continue;
8912 case '0': flags |= F_ZERO; continue;
8913 }
8914 break;
8915 }
8916 if (c == '*') {
8917 v = getnextarg(args, arglen, &argidx);
8918 if (v == NULL)
8919 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008920 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 PyErr_SetString(PyExc_TypeError,
8922 "* wants int");
8923 goto onError;
8924 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008925 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008926 if (width == -1 && PyErr_Occurred())
8927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 if (width < 0) {
8929 flags |= F_LJUST;
8930 width = -width;
8931 }
8932 if (--fmtcnt >= 0)
8933 c = *fmt++;
8934 }
8935 else if (c >= '0' && c <= '9') {
8936 width = c - '0';
8937 while (--fmtcnt >= 0) {
8938 c = *fmt++;
8939 if (c < '0' || c > '9')
8940 break;
8941 if ((width*10) / 10 != width) {
8942 PyErr_SetString(PyExc_ValueError,
8943 "width too big");
8944 goto onError;
8945 }
8946 width = width*10 + (c - '0');
8947 }
8948 }
8949 if (c == '.') {
8950 prec = 0;
8951 if (--fmtcnt >= 0)
8952 c = *fmt++;
8953 if (c == '*') {
8954 v = getnextarg(args, arglen, &argidx);
8955 if (v == NULL)
8956 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008957 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 PyErr_SetString(PyExc_TypeError,
8959 "* wants int");
8960 goto onError;
8961 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008962 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008963 if (prec == -1 && PyErr_Occurred())
8964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 if (prec < 0)
8966 prec = 0;
8967 if (--fmtcnt >= 0)
8968 c = *fmt++;
8969 }
8970 else if (c >= '0' && c <= '9') {
8971 prec = c - '0';
8972 while (--fmtcnt >= 0) {
8973 c = Py_CHARMASK(*fmt++);
8974 if (c < '0' || c > '9')
8975 break;
8976 if ((prec*10) / 10 != prec) {
8977 PyErr_SetString(PyExc_ValueError,
8978 "prec too big");
8979 goto onError;
8980 }
8981 prec = prec*10 + (c - '0');
8982 }
8983 }
8984 } /* prec */
8985 if (fmtcnt >= 0) {
8986 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 if (--fmtcnt >= 0)
8988 c = *fmt++;
8989 }
8990 }
8991 if (fmtcnt < 0) {
8992 PyErr_SetString(PyExc_ValueError,
8993 "incomplete format");
8994 goto onError;
8995 }
8996 if (c != '%') {
8997 v = getnextarg(args, arglen, &argidx);
8998 if (v == NULL)
8999 goto onError;
9000 }
9001 sign = 0;
9002 fill = ' ';
9003 switch (c) {
9004
9005 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009006 pbuf = formatbuf;
9007 /* presume that buffer length is at least 1 */
9008 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 len = 1;
9010 break;
9011
9012 case 's':
9013 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009014 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 if (PyUnicode_Check(v) && c == 's') {
9016 temp = v;
9017 Py_INCREF(temp);
9018 }
9019 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009021 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009022 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009024 else
9025 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 if (temp == NULL)
9027 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009028 if (PyUnicode_Check(temp))
9029 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009030 else {
9031 Py_DECREF(temp);
9032 PyErr_SetString(PyExc_TypeError,
9033 "%s argument has non-string str()");
9034 goto onError;
9035 }
9036 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009037 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 len = PyUnicode_GET_SIZE(temp);
9039 if (prec >= 0 && len > prec)
9040 len = prec;
9041 break;
9042
9043 case 'i':
9044 case 'd':
9045 case 'u':
9046 case 'o':
9047 case 'x':
9048 case 'X':
9049 if (c == 'i')
9050 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009051 isnumok = 0;
9052 if (PyNumber_Check(v)) {
9053 PyObject *iobj=NULL;
9054
9055 if (PyLong_Check(v)) {
9056 iobj = v;
9057 Py_INCREF(iobj);
9058 }
9059 else {
9060 iobj = PyNumber_Long(v);
9061 }
9062 if (iobj!=NULL) {
9063 if (PyLong_Check(iobj)) {
9064 isnumok = 1;
9065 temp = formatlong(iobj, flags, prec, c);
9066 Py_DECREF(iobj);
9067 if (!temp)
9068 goto onError;
9069 pbuf = PyUnicode_AS_UNICODE(temp);
9070 len = PyUnicode_GET_SIZE(temp);
9071 sign = 1;
9072 }
9073 else {
9074 Py_DECREF(iobj);
9075 }
9076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009078 if (!isnumok) {
9079 PyErr_Format(PyExc_TypeError,
9080 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009081 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009082 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009083 }
9084 if (flags & F_ZERO)
9085 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 break;
9087
9088 case 'e':
9089 case 'E':
9090 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009091 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 case 'g':
9093 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009094 if (c == 'F')
9095 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009096 pbuf = formatbuf;
9097 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9098 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 if (len < 0)
9100 goto onError;
9101 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009102 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 fill = '0';
9104 break;
9105
9106 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009107 pbuf = formatbuf;
9108 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 if (len < 0)
9110 goto onError;
9111 break;
9112
9113 default:
9114 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009115 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009116 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009117 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009118 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009119 (Py_ssize_t)(fmt - 1 -
9120 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 goto onError;
9122 }
9123 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009124 if (*pbuf == '-' || *pbuf == '+') {
9125 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 len--;
9127 }
9128 else if (flags & F_SIGN)
9129 sign = '+';
9130 else if (flags & F_BLANK)
9131 sign = ' ';
9132 else
9133 sign = 0;
9134 }
9135 if (width < len)
9136 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009137 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138 reslen -= rescnt;
9139 rescnt = width + fmtcnt + 100;
9140 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009141 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009142 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009143 PyErr_NoMemory();
9144 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009145 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009146 if (_PyUnicode_Resize(&result, reslen) < 0) {
9147 Py_XDECREF(temp);
9148 goto onError;
9149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 res = PyUnicode_AS_UNICODE(result)
9151 + reslen - rescnt;
9152 }
9153 if (sign) {
9154 if (fill != ' ')
9155 *res++ = sign;
9156 rescnt--;
9157 if (width > len)
9158 width--;
9159 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009160 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009161 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009162 assert(pbuf[1] == c);
9163 if (fill != ' ') {
9164 *res++ = *pbuf++;
9165 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009166 }
Tim Petersfff53252001-04-12 18:38:48 +00009167 rescnt -= 2;
9168 width -= 2;
9169 if (width < 0)
9170 width = 0;
9171 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 if (width > len && !(flags & F_LJUST)) {
9174 do {
9175 --rescnt;
9176 *res++ = fill;
9177 } while (--width > len);
9178 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009179 if (fill == ' ') {
9180 if (sign)
9181 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009182 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009183 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009184 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009185 *res++ = *pbuf++;
9186 *res++ = *pbuf++;
9187 }
9188 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009189 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190 res += len;
9191 rescnt -= len;
9192 while (--width >= len) {
9193 --rescnt;
9194 *res++ = ' ';
9195 }
9196 if (dict && (argidx < arglen) && c != '%') {
9197 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009198 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009199 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200 goto onError;
9201 }
9202 Py_XDECREF(temp);
9203 } /* '%' */
9204 } /* until end */
9205 if (argidx < arglen && !dict) {
9206 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009207 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 goto onError;
9209 }
9210
Thomas Woutersa96affe2006-03-12 00:29:36 +00009211 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9212 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 if (args_owned) {
9214 Py_DECREF(args);
9215 }
9216 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return (PyObject *)result;
9218
9219 onError:
9220 Py_XDECREF(result);
9221 Py_DECREF(uformat);
9222 if (args_owned) {
9223 Py_DECREF(args);
9224 }
9225 return NULL;
9226}
9227
Jeremy Hylton938ace62002-07-17 16:30:39 +00009228static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009229unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9230
Tim Peters6d6c1a32001-08-02 04:15:00 +00009231static PyObject *
9232unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9233{
9234 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009235 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009236 char *encoding = NULL;
9237 char *errors = NULL;
9238
Guido van Rossume023fe02001-08-30 03:12:59 +00009239 if (type != &PyUnicode_Type)
9240 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009241 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009242 kwlist, &x, &encoding, &errors))
9243 return NULL;
9244 if (x == NULL)
9245 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009246 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009247 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009248 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009249 return PyUnicode_FromEncodedObject(x, encoding, errors);
9250}
9251
Guido van Rossume023fe02001-08-30 03:12:59 +00009252static PyObject *
9253unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9254{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009255 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009256 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009257
9258 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9259 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9260 if (tmp == NULL)
9261 return NULL;
9262 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009263 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009264 if (pnew == NULL) {
9265 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009266 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009267 }
Christian Heimesb186d002008-03-18 15:15:01 +00009268 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009269 if (pnew->str == NULL) {
9270 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009271 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009272 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009273 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009274 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009275 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9276 pnew->length = n;
9277 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009278 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009279 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009280}
9281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009282PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009283"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009284\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009285Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009286encoding defaults to the current default string encoding.\n\
9287errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009288
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009289static PyObject *unicode_iter(PyObject *seq);
9290
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009292 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009293 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 sizeof(PyUnicodeObject), /* tp_size */
9295 0, /* tp_itemsize */
9296 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009297 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009299 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009301 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009302 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009303 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009305 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 (hashfunc) unicode_hash, /* tp_hash*/
9307 0, /* tp_call*/
9308 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009309 PyObject_GenericGetAttr, /* tp_getattro */
9310 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009311 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009312 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9313 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009314 unicode_doc, /* tp_doc */
9315 0, /* tp_traverse */
9316 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009317 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009318 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009319 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009320 0, /* tp_iternext */
9321 unicode_methods, /* tp_methods */
9322 0, /* tp_members */
9323 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009324 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009325 0, /* tp_dict */
9326 0, /* tp_descr_get */
9327 0, /* tp_descr_set */
9328 0, /* tp_dictoffset */
9329 0, /* tp_init */
9330 0, /* tp_alloc */
9331 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009332 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333};
9334
9335/* Initialize the Unicode implementation */
9336
Thomas Wouters78890102000-07-22 19:25:51 +00009337void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009339 int i;
9340
Thomas Wouters477c8d52006-05-27 19:21:47 +00009341 /* XXX - move this array to unicodectype.c ? */
9342 Py_UNICODE linebreak[] = {
9343 0x000A, /* LINE FEED */
9344 0x000D, /* CARRIAGE RETURN */
9345 0x001C, /* FILE SEPARATOR */
9346 0x001D, /* GROUP SEPARATOR */
9347 0x001E, /* RECORD SEPARATOR */
9348 0x0085, /* NEXT LINE */
9349 0x2028, /* LINE SEPARATOR */
9350 0x2029, /* PARAGRAPH SEPARATOR */
9351 };
9352
Fred Drakee4315f52000-05-09 19:53:39 +00009353 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009354 free_list = NULL;
9355 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009357 if (!unicode_empty)
9358 return;
9359
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009360 for (i = 0; i < 256; i++)
9361 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009362 if (PyType_Ready(&PyUnicode_Type) < 0)
9363 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364
9365 /* initialize the linebreak bloom filter */
9366 bloom_linebreak = make_bloom_mask(
9367 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9368 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009369
9370 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371}
9372
9373/* Finalize the Unicode implementation */
9374
Christian Heimesa156e092008-02-16 07:38:31 +00009375int
9376PyUnicode_ClearFreeList(void)
9377{
9378 int freelist_size = numfree;
9379 PyUnicodeObject *u;
9380
9381 for (u = free_list; u != NULL;) {
9382 PyUnicodeObject *v = u;
9383 u = *(PyUnicodeObject **)u;
9384 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009385 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009386 Py_XDECREF(v->defenc);
9387 PyObject_Del(v);
9388 numfree--;
9389 }
9390 free_list = NULL;
9391 assert(numfree == 0);
9392 return freelist_size;
9393}
9394
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395void
Thomas Wouters78890102000-07-22 19:25:51 +00009396_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009398 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009400 Py_XDECREF(unicode_empty);
9401 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009402
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009403 for (i = 0; i < 256; i++) {
9404 if (unicode_latin1[i]) {
9405 Py_DECREF(unicode_latin1[i]);
9406 unicode_latin1[i] = NULL;
9407 }
9408 }
Christian Heimesa156e092008-02-16 07:38:31 +00009409 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009411
Walter Dörwald16807132007-05-25 13:52:07 +00009412void
9413PyUnicode_InternInPlace(PyObject **p)
9414{
9415 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9416 PyObject *t;
9417 if (s == NULL || !PyUnicode_Check(s))
9418 Py_FatalError(
9419 "PyUnicode_InternInPlace: unicode strings only please!");
9420 /* If it's a subclass, we don't really know what putting
9421 it in the interned dict might do. */
9422 if (!PyUnicode_CheckExact(s))
9423 return;
9424 if (PyUnicode_CHECK_INTERNED(s))
9425 return;
9426 if (interned == NULL) {
9427 interned = PyDict_New();
9428 if (interned == NULL) {
9429 PyErr_Clear(); /* Don't leave an exception */
9430 return;
9431 }
9432 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009433 /* It might be that the GetItem call fails even
9434 though the key is present in the dictionary,
9435 namely when this happens during a stack overflow. */
9436 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009437 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009438 Py_END_ALLOW_RECURSION
9439
Walter Dörwald16807132007-05-25 13:52:07 +00009440 if (t) {
9441 Py_INCREF(t);
9442 Py_DECREF(*p);
9443 *p = t;
9444 return;
9445 }
9446
Martin v. Löwis5b222132007-06-10 09:51:05 +00009447 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009448 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9449 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009450 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009451 return;
9452 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009453 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009454 /* The two references in interned are not counted by refcnt.
9455 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009456 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009457 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9458}
9459
9460void
9461PyUnicode_InternImmortal(PyObject **p)
9462{
9463 PyUnicode_InternInPlace(p);
9464 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9465 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9466 Py_INCREF(*p);
9467 }
9468}
9469
9470PyObject *
9471PyUnicode_InternFromString(const char *cp)
9472{
9473 PyObject *s = PyUnicode_FromString(cp);
9474 if (s == NULL)
9475 return NULL;
9476 PyUnicode_InternInPlace(&s);
9477 return s;
9478}
9479
9480void _Py_ReleaseInternedUnicodeStrings(void)
9481{
9482 PyObject *keys;
9483 PyUnicodeObject *s;
9484 Py_ssize_t i, n;
9485 Py_ssize_t immortal_size = 0, mortal_size = 0;
9486
9487 if (interned == NULL || !PyDict_Check(interned))
9488 return;
9489 keys = PyDict_Keys(interned);
9490 if (keys == NULL || !PyList_Check(keys)) {
9491 PyErr_Clear();
9492 return;
9493 }
9494
9495 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9496 detector, interned unicode strings are not forcibly deallocated;
9497 rather, we give them their stolen references back, and then clear
9498 and DECREF the interned dict. */
9499
9500 n = PyList_GET_SIZE(keys);
9501 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9502 n);
9503 for (i = 0; i < n; i++) {
9504 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9505 switch (s->state) {
9506 case SSTATE_NOT_INTERNED:
9507 /* XXX Shouldn't happen */
9508 break;
9509 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009510 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009511 immortal_size += s->length;
9512 break;
9513 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009514 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009515 mortal_size += s->length;
9516 break;
9517 default:
9518 Py_FatalError("Inconsistent interned string state.");
9519 }
9520 s->state = SSTATE_NOT_INTERNED;
9521 }
9522 fprintf(stderr, "total size of all interned strings: "
9523 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9524 "mortal/immortal\n", mortal_size, immortal_size);
9525 Py_DECREF(keys);
9526 PyDict_Clear(interned);
9527 Py_DECREF(interned);
9528 interned = NULL;
9529}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009530
9531
9532/********************* Unicode Iterator **************************/
9533
9534typedef struct {
9535 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009536 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009537 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9538} unicodeiterobject;
9539
9540static void
9541unicodeiter_dealloc(unicodeiterobject *it)
9542{
9543 _PyObject_GC_UNTRACK(it);
9544 Py_XDECREF(it->it_seq);
9545 PyObject_GC_Del(it);
9546}
9547
9548static int
9549unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9550{
9551 Py_VISIT(it->it_seq);
9552 return 0;
9553}
9554
9555static PyObject *
9556unicodeiter_next(unicodeiterobject *it)
9557{
9558 PyUnicodeObject *seq;
9559 PyObject *item;
9560
9561 assert(it != NULL);
9562 seq = it->it_seq;
9563 if (seq == NULL)
9564 return NULL;
9565 assert(PyUnicode_Check(seq));
9566
9567 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009568 item = PyUnicode_FromUnicode(
9569 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009570 if (item != NULL)
9571 ++it->it_index;
9572 return item;
9573 }
9574
9575 Py_DECREF(seq);
9576 it->it_seq = NULL;
9577 return NULL;
9578}
9579
9580static PyObject *
9581unicodeiter_len(unicodeiterobject *it)
9582{
9583 Py_ssize_t len = 0;
9584 if (it->it_seq)
9585 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009586 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009587}
9588
9589PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9590
9591static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009592 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9593 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009594 {NULL, NULL} /* sentinel */
9595};
9596
9597PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009598 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009599 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009600 sizeof(unicodeiterobject), /* tp_basicsize */
9601 0, /* tp_itemsize */
9602 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009603 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009604 0, /* tp_print */
9605 0, /* tp_getattr */
9606 0, /* tp_setattr */
9607 0, /* tp_compare */
9608 0, /* tp_repr */
9609 0, /* tp_as_number */
9610 0, /* tp_as_sequence */
9611 0, /* tp_as_mapping */
9612 0, /* tp_hash */
9613 0, /* tp_call */
9614 0, /* tp_str */
9615 PyObject_GenericGetAttr, /* tp_getattro */
9616 0, /* tp_setattro */
9617 0, /* tp_as_buffer */
9618 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9619 0, /* tp_doc */
9620 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9621 0, /* tp_clear */
9622 0, /* tp_richcompare */
9623 0, /* tp_weaklistoffset */
9624 PyObject_SelfIter, /* tp_iter */
9625 (iternextfunc)unicodeiter_next, /* tp_iternext */
9626 unicodeiter_methods, /* tp_methods */
9627 0,
9628};
9629
9630static PyObject *
9631unicode_iter(PyObject *seq)
9632{
9633 unicodeiterobject *it;
9634
9635 if (!PyUnicode_Check(seq)) {
9636 PyErr_BadInternalCall();
9637 return NULL;
9638 }
9639 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9640 if (it == NULL)
9641 return NULL;
9642 it->it_index = 0;
9643 Py_INCREF(seq);
9644 it->it_seq = (PyUnicodeObject *)seq;
9645 _PyObject_GC_TRACK(it);
9646 return (PyObject *)it;
9647}
9648
Martin v. Löwis5b222132007-06-10 09:51:05 +00009649size_t
9650Py_UNICODE_strlen(const Py_UNICODE *u)
9651{
9652 int res = 0;
9653 while(*u++)
9654 res++;
9655 return res;
9656}
9657
9658Py_UNICODE*
9659Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9660{
9661 Py_UNICODE *u = s1;
9662 while ((*u++ = *s2++));
9663 return s1;
9664}
9665
9666Py_UNICODE*
9667Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9668{
9669 Py_UNICODE *u = s1;
9670 while ((*u++ = *s2++))
9671 if (n-- == 0)
9672 break;
9673 return s1;
9674}
9675
9676int
9677Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9678{
9679 while (*s1 && *s2 && *s1 == *s2)
9680 s1++, s2++;
9681 if (*s1 && *s2)
9682 return (*s1 < *s2) ? -1 : +1;
9683 if (*s1)
9684 return 1;
9685 if (*s2)
9686 return -1;
9687 return 0;
9688}
9689
9690Py_UNICODE*
9691Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9692{
9693 const Py_UNICODE *p;
9694 for (p = s; *p; p++)
9695 if (*p == c)
9696 return (Py_UNICODE*)p;
9697 return NULL;
9698}
9699
9700
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009701#ifdef __cplusplus
9702}
9703#endif
9704
9705
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009706/*
9707Local variables:
9708c-basic-offset: 4
9709indent-tabs-mode: nil
9710End:
9711*/