blob: 78e38b5e3841a8004b5835e088e2c1a04b8e786d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code point c (0x00..0x7F) is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same idea for line breaks: entry c is 1 when the ASCII code point c
   (0x00..0x7F) is a line break character and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things simple
   a single bitmask is used; the least significant 5 bits of each unicode
   character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 left
   by 31 overflows a 32-bit int, which is undefined behaviour.  mask is
   parenthesised so that an expression argument is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII characters are looked up in ascii_linebreak;
   anything else must pass the bloom_linebreak pre-filter before the full
   Py_UNICODE_ISLINEBREAK() check is made. */
#define BLOOM_LINEBREAK(ch)                                         \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                          \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True if chr passes the bloom pre-filter for mask AND really occurs in
   the setlen-character array set.  The whole expansion is parenthesised
   so the macro behaves as a single operand when negated or combined with
   other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                 \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000315 if (free_list) {
316 unicode = free_list;
317 free_list = *(PyUnicodeObject **)unicode;
318 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Keep-Alive optimization: we only upsize the buffer,
321 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000322 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000323 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000324 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000328 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000335 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 if (unicode == NULL)
338 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 }
342
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000343 if (!unicode->str) {
344 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000345 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000346 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000348 * the caller fails before initializing str -- unicode_resize()
349 * reads str[0], and the Keep-Alive optimization can keep memory
350 * allocated for str alive across a call to unicode_dealloc(unicode).
351 * We don't want unicode_resize to read uninitialized memory in
352 * that case.
353 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000354 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000358 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000359 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000361
362 onError:
363 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000364 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366}
367
368static
Guido van Rossum9475a232001-10-05 20:51:39 +0000369void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370{
Walter Dörwald16807132007-05-25 13:52:07 +0000371 switch (PyUnicode_CHECK_INTERNED(unicode)) {
372 case SSTATE_NOT_INTERNED:
373 break;
374
375 case SSTATE_INTERNED_MORTAL:
376 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000377 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
379 Py_FatalError(
380 "deletion of interned unicode string failed");
381 break;
382
383 case SSTATE_INTERNED_IMMORTAL:
384 Py_FatalError("Immortal interned unicode string died.");
385
386 default:
387 Py_FatalError("Inconsistent interned unicode string state.");
388 }
389
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000394 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 unicode->str = NULL;
396 unicode->length = 0;
397 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000398 if (unicode->defenc) {
399 Py_DECREF(unicode->defenc);
400 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000401 }
402 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000403 *(PyUnicodeObject **)unicode = free_list;
404 free_list = unicode;
405 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406 }
407 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000408 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000409 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000410 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411 }
412}
413
Martin v. Löwis18e16552006-02-15 17:27:45 +0000414int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000439 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 *unicode = (PyObject *)w;
441 return 0;
442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address of
   a unicode object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
452
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455{
456 PyUnicodeObject *unicode;
457
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
462 /* Optimization for empty strings */
463 if (size == 0 && unicode_empty != NULL) {
464 Py_INCREF(unicode_empty);
465 return (PyObject *)unicode_empty;
466 }
467
468 /* Single character Unicode objects in the Latin-1 range are
469 shared when using this constructor */
470 if (size == 1 && *u < 256) {
471 unicode = unicode_latin1[*u];
472 if (!unicode) {
473 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 if (!unicode)
475 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000476 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 unicode_latin1[*u] = unicode;
478 }
479 Py_INCREF(unicode);
480 return (PyObject *)unicode;
481 }
482 }
Tim Petersced69f82003-09-16 20:30:58 +0000483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 unicode = _PyUnicode_New(size);
485 if (!unicode)
486 return NULL;
487
488 /* Copy the Unicode data into the new object */
489 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491
492 return (PyObject *)unicode;
493}
494
Walter Dörwaldd2034312007-05-18 16:29:38 +0000495PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000496{
497 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000498
499 if (size < 0) {
500 PyErr_SetString(PyExc_SystemError,
501 "Negative size passed to PyUnicode_FromStringAndSize");
502 return NULL;
503 }
504
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000506 some optimizations which share commonly used objects.
507 Also, this means the input must be UTF-8, so fall back to the
508 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000509 if (u != NULL) {
510
511 /* Optimization for empty strings */
512 if (size == 0 && unicode_empty != NULL) {
513 Py_INCREF(unicode_empty);
514 return (PyObject *)unicode_empty;
515 }
516
Martin v. Löwis9c121062007-08-05 20:26:11 +0000517 /* Single characters are shared when using this constructor.
518 Restrict to ASCII, since the input must be UTF-8. */
519 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000520 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521 if (!unicode) {
522 unicode = _PyUnicode_New(1);
523 if (!unicode)
524 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000525 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000526 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 }
528 Py_INCREF(unicode);
529 return (PyObject *)unicode;
530 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531
532 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000533 }
534
Walter Dörwald55507312007-05-18 13:12:10 +0000535 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 if (!unicode)
537 return NULL;
538
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 return (PyObject *)unicode;
540}
541
Walter Dörwaldd2034312007-05-18 16:29:38 +0000542PyObject *PyUnicode_FromString(const char *u)
543{
544 size_t size = strlen(u);
545 if (size > PY_SSIZE_T_MAX) {
546 PyErr_SetString(PyExc_OverflowError, "input too long");
547 return NULL;
548 }
549
550 return PyUnicode_FromStringAndSize(u, size);
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553#ifdef HAVE_WCHAR_H
554
555PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000556 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000557{
558 PyUnicodeObject *unicode;
559
560 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000561 if (size == 0)
562 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
Martin v. Löwis790465f2008-04-05 20:41:37 +0000567 if (size == -1) {
568 size = wcslen(w);
569 }
570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 unicode = _PyUnicode_New(size);
572 if (!unicode)
573 return NULL;
574
575 /* Copy the wchar_t data into the new object */
576#ifdef HAVE_USABLE_WCHAR_T
577 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000578#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579 {
580 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000583 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 *u++ = *w++;
585 }
586#endif
587
588 return (PyObject *)unicode;
589}
590
Walter Dörwald346737f2007-05-31 10:44:43 +0000591static void
592makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
593{
594 *fmt++ = '%';
595 if (width) {
596 if (zeropad)
597 *fmt++ = '0';
598 fmt += sprintf(fmt, "%d", width);
599 }
600 if (precision)
601 fmt += sprintf(fmt, ".%d", precision);
602 if (longflag)
603 *fmt++ = 'l';
604 else if (size_tflag) {
605 char *f = PY_FORMAT_SIZE_T;
606 while (*f)
607 *fmt++ = *f++;
608 }
609 *fmt++ = c;
610 *fmt = '\0';
611}
612
/* Append the NUL-terminated char string `string` to the output position
   s, one element per char, advancing s past the copied characters.
   Uses the variables `copy` and `s` of the expansion site. */
#define appendstring(string) \
    {for (copy = string;*copy;) *s++ = *copy++;}
614
615PyObject *
616PyUnicode_FromFormatV(const char *format, va_list vargs)
617{
618 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000619 Py_ssize_t callcount = 0;
620 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000621 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000623 int width = 0;
624 int precision = 0;
625 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 const char* f;
627 Py_UNICODE *s;
628 PyObject *string;
629 /* used by sprintf */
630 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000631 /* use abuffer instead of buffer, if we need more space
632 * (which can happen if there's a format specifier with width). */
633 char *abuffer = NULL;
634 char *realbuffer;
635 Py_ssize_t abuffersize = 0;
636 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000637 const char *copy;
638
639#ifdef VA_LIST_IS_ARRAY
640 Py_MEMCPY(count, vargs, sizeof(va_list));
641#else
642#ifdef __va_copy
643 __va_copy(count, vargs);
644#else
645 count = vargs;
646#endif
647#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000648 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000649 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000650 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000651 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000652 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000653 ++callcount;
654 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000655 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000656 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000657 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000658 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 if (!callresults) {
660 PyErr_NoMemory();
661 return NULL;
662 }
663 callresult = callresults;
664 }
665 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666 for (f = format; *f; f++) {
667 if (*f == '%') {
668 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000669 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000670 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000671 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000672 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000673 ;
674
675 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
676 * they don't affect the amount of space we reserve.
677 */
678 if ((*f == 'l' || *f == 'z') &&
679 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000680 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681
682 switch (*f) {
683 case 'c':
684 (void)va_arg(count, int);
685 /* fall through... */
686 case '%':
687 n++;
688 break;
689 case 'd': case 'u': case 'i': case 'x':
690 (void) va_arg(count, int);
691 /* 20 bytes is enough to hold a 64-bit
692 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000693 This isn't enough for octal.
694 If a width is specified we need more
695 (which we allocate later). */
696 if (width < 20)
697 width = 20;
698 n += width;
699 if (abuffersize < width)
700 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000701 break;
702 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000703 {
704 /* UTF-8 */
705 unsigned char*s;
706 s = va_arg(count, unsigned char*);
707 while (*s) {
708 if (*s < 128) {
709 n++; s++;
710 } else if (*s < 0xc0) {
711 /* invalid UTF-8 */
712 n++; s++;
713 } else if (*s < 0xc0) {
714 n++;
715 s++; if(!*s)break;
716 s++;
717 } else if (*s < 0xe0) {
718 n++;
719 s++; if(!*s)break;
720 s++; if(!*s)break;
721 s++;
722 } else {
723 #ifdef Py_UNICODE_WIDE
724 n++;
725 #else
726 n+=2;
727 #endif
728 s++; if(!*s)break;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 }
733 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000734 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000735 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 case 'U':
737 {
738 PyObject *obj = va_arg(count, PyObject *);
739 assert(obj && PyUnicode_Check(obj));
740 n += PyUnicode_GET_SIZE(obj);
741 break;
742 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000743 case 'V':
744 {
745 PyObject *obj = va_arg(count, PyObject *);
746 const char *str = va_arg(count, const char *);
747 assert(obj || str);
748 assert(!obj || PyUnicode_Check(obj));
749 if (obj)
750 n += PyUnicode_GET_SIZE(obj);
751 else
752 n += strlen(str);
753 break;
754 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000755 case 'S':
756 {
757 PyObject *obj = va_arg(count, PyObject *);
758 PyObject *str;
759 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000760 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000761 if (!str)
762 goto fail;
763 n += PyUnicode_GET_SIZE(str);
764 /* Remember the str and switch to the next slot */
765 *callresult++ = str;
766 break;
767 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000768 case 'R':
769 {
770 PyObject *obj = va_arg(count, PyObject *);
771 PyObject *repr;
772 assert(obj);
773 repr = PyObject_Repr(obj);
774 if (!repr)
775 goto fail;
776 n += PyUnicode_GET_SIZE(repr);
777 /* Remember the repr and switch to the next slot */
778 *callresult++ = repr;
779 break;
780 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000781 case 'p':
782 (void) va_arg(count, int);
783 /* maximum 64-bit pointer representation:
784 * 0xffffffffffffffff
785 * so 19 characters is enough.
786 * XXX I count 18 -- what's the extra for?
787 */
788 n += 19;
789 break;
790 default:
791 /* if we stumble upon an unknown
792 formatting code, copy the rest of
793 the format string to the output
794 string. (we cannot just skip the
795 code, since there's no way to know
796 what's in the argument list) */
797 n += strlen(p);
798 goto expand;
799 }
800 } else
801 n++;
802 }
803 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000805 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000806 if (!abuffer) {
807 PyErr_NoMemory();
808 goto fail;
809 }
810 realbuffer = abuffer;
811 }
812 else
813 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000815 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 we don't have to resize the string.
817 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 string = PyUnicode_FromUnicode(NULL, n);
819 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821
822 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000823 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000824
825 for (f = format; *f; f++) {
826 if (*f == '%') {
827 const char* p = f++;
828 int longflag = 0;
829 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000830 zeropad = (*f == '0');
831 /* parse the width.precision part */
832 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000833 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000834 width = (width*10) + *f++ - '0';
835 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 if (*f == '.') {
837 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000838 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000839 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000840 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 /* handle the long flag, but only for %ld and %lu.
842 others can be added when necessary. */
843 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
844 longflag = 1;
845 ++f;
846 }
847 /* handle the size_t flag. */
848 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
849 size_tflag = 1;
850 ++f;
851 }
852
853 switch (*f) {
854 case 'c':
855 *s++ = va_arg(vargs, int);
856 break;
857 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000858 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000864 sprintf(realbuffer, fmt, va_arg(vargs, int));
865 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000866 break;
867 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000868 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000870 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000873 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
875 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000876 break;
877 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
879 sprintf(realbuffer, fmt, va_arg(vargs, int));
880 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000881 break;
882 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
884 sprintf(realbuffer, fmt, va_arg(vargs, int));
885 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 break;
887 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000888 {
889 /* Parameter must be UTF-8 encoded.
890 In case of encoding errors, use
891 the replacement character. */
892 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000893 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000894 u = PyUnicode_DecodeUTF8(p, strlen(p),
895 "replace");
896 if (!u)
897 goto fail;
898 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
899 PyUnicode_GET_SIZE(u));
900 s += PyUnicode_GET_SIZE(u);
901 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000903 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 case 'U':
905 {
906 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000907 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
909 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000910 break;
911 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000912 case 'V':
913 {
914 PyObject *obj = va_arg(vargs, PyObject *);
915 const char *str = va_arg(vargs, const char *);
916 if (obj) {
917 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
918 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
919 s += size;
920 } else {
921 appendstring(str);
922 }
923 break;
924 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000925 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000926 case 'R':
927 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000928 Py_UNICODE *ucopy;
929 Py_ssize_t usize;
930 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000931 /* unused, since we already have the result */
932 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000933 ucopy = PyUnicode_AS_UNICODE(*callresult);
934 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000935 for (upos = 0; upos<usize;)
936 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000937 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000938 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000939 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000940 ++callresult;
941 break;
942 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000943 case 'p':
944 sprintf(buffer, "%p", va_arg(vargs, void*));
945 /* %p is ill-defined: ensure leading 0x. */
946 if (buffer[1] == 'X')
947 buffer[1] = 'x';
948 else if (buffer[1] != 'x') {
949 memmove(buffer+2, buffer, strlen(buffer)+1);
950 buffer[0] = '0';
951 buffer[1] = 'x';
952 }
953 appendstring(buffer);
954 break;
955 case '%':
956 *s++ = '%';
957 break;
958 default:
959 appendstring(p);
960 goto end;
961 }
962 } else
963 *s++ = *f;
964 }
965
966 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000967 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000968 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000969 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000970 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
972 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000973 fail:
974 if (callresults) {
975 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000976 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000977 Py_DECREF(*callresult2);
978 ++callresult2;
979 }
Christian Heimesb186d002008-03-18 15:15:01 +0000980 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000981 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000982 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000983 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000984 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000985}
986
987#undef appendstring
988
989PyObject *
990PyUnicode_FromFormat(const char *format, ...)
991{
992 PyObject* ret;
993 va_list vargs;
994
995#ifdef HAVE_STDARG_PROTOTYPES
996 va_start(vargs, format);
997#else
998 va_start(vargs);
999#endif
1000 ret = PyUnicode_FromFormatV(format, vargs);
1001 va_end(vargs);
1002 return ret;
1003}
1004
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1006 wchar_t *w,
1007 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008{
1009 if (unicode == NULL) {
1010 PyErr_BadInternalCall();
1011 return -1;
1012 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001013
1014 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001016 size = PyUnicode_GET_SIZE(unicode) + 1;
1017
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018#ifdef HAVE_USABLE_WCHAR_T
1019 memcpy(w, unicode->str, size * sizeof(wchar_t));
1020#else
1021 {
1022 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001023 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001025 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 *w++ = *u++;
1027 }
1028#endif
1029
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030 if (size > PyUnicode_GET_SIZE(unicode))
1031 return PyUnicode_GET_SIZE(unicode);
1032 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return size;
1034}
1035
1036#endif
1037
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001038PyObject *PyUnicode_FromOrdinal(int ordinal)
1039{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001040 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001041
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001042 if (ordinal < 0 || ordinal > 0x10ffff) {
1043 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001044 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001045 return NULL;
1046 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001047
1048#ifndef Py_UNICODE_WIDE
1049 if (ordinal > 0xffff) {
1050 ordinal -= 0x10000;
1051 s[0] = 0xD800 | (ordinal >> 10);
1052 s[1] = 0xDC00 | (ordinal & 0x3FF);
1053 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054 }
1055#endif
1056
Hye-Shik Chang40574832004-04-06 07:24:51 +00001057 s[0] = (Py_UNICODE)ordinal;
1058 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059}
1060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061PyObject *PyUnicode_FromObject(register PyObject *obj)
1062{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001064 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 if (PyUnicode_CheckExact(obj)) {
1066 Py_INCREF(obj);
1067 return obj;
1068 }
1069 if (PyUnicode_Check(obj)) {
1070 /* For a Unicode subtype that's not a Unicode object,
1071 return a true Unicode object with the same data. */
1072 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1073 PyUnicode_GET_SIZE(obj));
1074 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001075 PyErr_Format(PyExc_TypeError,
1076 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001077 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001078 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001079}
1080
1081PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1082 const char *encoding,
1083 const char *errors)
1084{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001085 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 if (obj == NULL) {
1090 PyErr_BadInternalCall();
1091 return NULL;
1092 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094 if (PyUnicode_Check(obj)) {
1095 PyErr_SetString(PyExc_TypeError,
1096 "decoding Unicode is not supported");
1097 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099
1100 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001101 if (PyBytes_Check(obj)) {
1102 s = PyBytes_AS_STRING(obj);
1103 len = PyBytes_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001105 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1106 /* Overwrite the error message with something more useful in
1107 case of a TypeError. */
1108 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001110 "coercing to Unicode: need string or buffer, "
1111 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001112 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001113 goto onError;
1114 }
Tim Petersced69f82003-09-16 20:30:58 +00001115
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 if (len == 0) {
1118 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 }
Tim Petersced69f82003-09-16 20:30:58 +00001121 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001122 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001123
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124 return v;
1125
1126 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128}
1129
1130PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001131 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001136 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001137 char lower[20]; /* Enough for any encoding name we recognize */
1138 char *l;
1139 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001140
1141 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001142 encoding = PyUnicode_GetDefaultEncoding();
1143
1144 /* Convert encoding to lower case and replace '_' with '-' in order to
1145 catch e.g. UTF_8 */
1146 e = encoding;
1147 l = lower;
1148 while (*e && l < &lower[(sizeof lower) - 2]) {
1149 if (ISUPPER(*e)) {
1150 *l++ = TOLOWER(*e++);
1151 }
1152 else if (*e == '_') {
1153 *l++ = '-';
1154 e++;
1155 }
1156 else {
1157 *l++ = *e++;
1158 }
1159 }
1160 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001161
1162 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001163 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001165 else if ((strcmp(lower, "latin-1") == 0) ||
1166 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001167 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001168#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001170 return PyUnicode_DecodeMBCS(s, size, errors);
1171#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001172 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001173 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001174 else if (strcmp(lower, "utf-16") == 0)
1175 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1176 else if (strcmp(lower, "utf-32") == 0)
1177 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178
1179 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001180 buffer = NULL;
1181 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1182 goto onError;
1183 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 if (buffer == NULL)
1185 goto onError;
1186 unicode = PyCodec_Decode(buffer, encoding, errors);
1187 if (unicode == NULL)
1188 goto onError;
1189 if (!PyUnicode_Check(unicode)) {
1190 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001191 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001192 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 Py_DECREF(unicode);
1194 goto onError;
1195 }
1196 Py_DECREF(buffer);
1197 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 onError:
1200 Py_XDECREF(buffer);
1201 return NULL;
1202}
1203
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001204PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1205 const char *encoding,
1206 const char *errors)
1207{
1208 PyObject *v;
1209
1210 if (!PyUnicode_Check(unicode)) {
1211 PyErr_BadArgument();
1212 goto onError;
1213 }
1214
1215 if (encoding == NULL)
1216 encoding = PyUnicode_GetDefaultEncoding();
1217
1218 /* Decode via the codec registry */
1219 v = PyCodec_Decode(unicode, encoding, errors);
1220 if (v == NULL)
1221 goto onError;
1222 return v;
1223
1224 onError:
1225 return NULL;
1226}
1227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 const char *encoding,
1231 const char *errors)
1232{
1233 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001234
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 unicode = PyUnicode_FromUnicode(s, size);
1236 if (unicode == NULL)
1237 return NULL;
1238 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1239 Py_DECREF(unicode);
1240 return v;
1241}
1242
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1244 const char *encoding,
1245 const char *errors)
1246{
1247 PyObject *v;
1248
1249 if (!PyUnicode_Check(unicode)) {
1250 PyErr_BadArgument();
1251 goto onError;
1252 }
1253
1254 if (encoding == NULL)
1255 encoding = PyUnicode_GetDefaultEncoding();
1256
1257 /* Encode via the codec registry */
1258 v = PyCodec_Encode(unicode, encoding, errors);
1259 if (v == NULL)
1260 goto onError;
1261 return v;
1262
1263 onError:
1264 return NULL;
1265}
1266
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1268 const char *encoding,
1269 const char *errors)
1270{
1271 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_BadArgument();
1275 goto onError;
1276 }
Fred Drakee4315f52000-05-09 19:53:39 +00001277
Tim Petersced69f82003-09-16 20:30:58 +00001278 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001279 encoding = PyUnicode_GetDefaultEncoding();
1280
1281 /* Shortcuts for common default encodings */
1282 if (errors == NULL) {
1283 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001284 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001285 else if (strcmp(encoding, "latin-1") == 0)
1286 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001287#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1288 else if (strcmp(encoding, "mbcs") == 0)
1289 return PyUnicode_AsMBCSString(unicode);
1290#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001291 else if (strcmp(encoding, "ascii") == 0)
1292 return PyUnicode_AsASCIIString(unicode);
1293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294
1295 /* Encode via the codec registry */
1296 v = PyCodec_Encode(unicode, encoding, errors);
1297 if (v == NULL)
1298 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001299 assert(PyBytes_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001301
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 onError:
1303 return NULL;
1304}
1305
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001306PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1307 const char *errors)
1308{
1309 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001310 if (v)
1311 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001312 if (errors != NULL)
1313 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001314 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001315 PyUnicode_GET_SIZE(unicode),
1316 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001317 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001318 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001319 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001320 return v;
1321}
1322
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001323PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001324PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001325 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001326 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1327}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001328
Christian Heimes5894ba72007-11-04 11:43:14 +00001329PyObject*
1330PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1331{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001332 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1333 can be undefined. If it is case, decode using UTF-8. The following assumes
1334 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1335 bootstrapping process where the codecs aren't ready yet.
1336 */
1337 if (Py_FileSystemDefaultEncoding) {
1338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001339 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001340 return PyUnicode_DecodeMBCS(s, size, "replace");
1341 }
1342#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001343 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001344 return PyUnicode_DecodeUTF8(s, size, "replace");
1345 }
1346#endif
1347 return PyUnicode_Decode(s, size,
1348 Py_FileSystemDefaultEncoding,
1349 "replace");
1350 }
1351 else {
1352 return PyUnicode_DecodeUTF8(s, size, "replace");
1353 }
1354}
1355
Martin v. Löwis5b222132007-06-10 09:51:05 +00001356char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001357PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001358{
Christian Heimesf3863112007-11-22 07:46:41 +00001359 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 return NULL;
1363 }
Christian Heimesf3863112007-11-22 07:46:41 +00001364 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1365 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001366 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001367 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001368 *psize = PyBytes_GET_SIZE(bytes);
1369 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001370}
1371
1372char*
1373PyUnicode_AsString(PyObject *unicode)
1374{
1375 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1379{
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 goto onError;
1383 }
1384 return PyUnicode_AS_UNICODE(unicode);
1385
1386 onError:
1387 return NULL;
1388}
1389
Martin v. Löwis18e16552006-02-15 17:27:45 +00001390Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391{
1392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_BadArgument();
1394 goto onError;
1395 }
1396 return PyUnicode_GET_SIZE(unicode);
1397
1398 onError:
1399 return -1;
1400}
1401
Thomas Wouters78890102000-07-22 19:25:51 +00001402const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001403{
1404 return unicode_default_encoding;
1405}
1406
1407int PyUnicode_SetDefaultEncoding(const char *encoding)
1408{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001409 if (strcmp(encoding, unicode_default_encoding) != 0) {
1410 PyErr_Format(PyExc_ValueError,
1411 "Can only set default encoding to %s",
1412 unicode_default_encoding);
1413 return -1;
1414 }
Fred Drakee4315f52000-05-09 19:53:39 +00001415 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001416}
1417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418/* error handling callback helper:
1419 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001420 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001421 and adjust various state variables.
1422 return 0 on success, -1 on error
1423*/
1424
1425static
1426int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1427 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001428 const char **input, const char **inend, Py_ssize_t *startinpos,
1429 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001430 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433
1434 PyObject *restuple = NULL;
1435 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001437 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001438 Py_ssize_t requiredsize;
1439 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001441 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001442 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 int res = -1;
1444
1445 if (*errorHandler == NULL) {
1446 *errorHandler = PyCodec_LookupError(errors);
1447 if (*errorHandler == NULL)
1448 goto onError;
1449 }
1450
1451 if (*exceptionObject == NULL) {
1452 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001453 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 if (*exceptionObject == NULL)
1455 goto onError;
1456 }
1457 else {
1458 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1459 goto onError;
1460 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1461 goto onError;
1462 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1463 goto onError;
1464 }
1465
1466 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1467 if (restuple == NULL)
1468 goto onError;
1469 if (!PyTuple_Check(restuple)) {
1470 PyErr_Format(PyExc_TypeError, &argparse[4]);
1471 goto onError;
1472 }
1473 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1474 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001475
1476 /* Copy back the bytes variables, which might have been modified by the
1477 callback */
1478 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1479 if (!inputobj)
1480 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001481 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001482 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1483 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001484 *input = PyBytes_AS_STRING(inputobj);
1485 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001486 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001487 /* we can DECREF safely, as the exception has another reference,
1488 so the object won't go away. */
1489 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001492 newpos = insize+newpos;
1493 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001494 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001495 goto onError;
1496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497
1498 /* need more space? (at least enough for what we
1499 have+the replacement+the rest of the string (starting
1500 at the new input position), so we won't have to check space
1501 when there are no errors in the rest of the string) */
1502 repptr = PyUnicode_AS_UNICODE(repunicode);
1503 repsize = PyUnicode_GET_SIZE(repunicode);
1504 requiredsize = *outpos + repsize + insize-newpos;
1505 if (requiredsize > outsize) {
1506 if (requiredsize<2*outsize)
1507 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001508 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 goto onError;
1510 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1511 }
1512 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001513 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001514 Py_UNICODE_COPY(*outptr, repptr, repsize);
1515 *outptr += repsize;
1516 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 /* we made it! */
1519 res = 0;
1520
1521 onError:
1522 Py_XDECREF(restuple);
1523 return res;
1524}
1525
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526/* --- UTF-7 Codec -------------------------------------------------------- */
1527
1528/* see RFC2152 for details */
1529
/* Classification of the 128 ASCII code points for the UTF-7 codec
   (see RFC 2152).  Indexed by character code and consulted through the
   SPECIAL() macro:

     0 - not special (may be written directly)
     1 - special (cannot be directly encoded)
     2 - whitespace (direct encoding is optional)
     3 - RFC 2152 Set O (direct encoding is optional)

   The table is never modified, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' and upper case letters */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: upper case letters and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' and lower case letters */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: lower case letters, punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1548
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c may not be written directly and has to go into a
   base64 shift sequence.  encodeO / encodeWS select whether the optional
   RFC 2152 Set O characters / whitespace characters count as special too.
   c is evaluated more than once, so pass a side-effect free expression. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful if B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to *out (advancing out) for as long as the bit
   accumulator ch still holds at least six of its low `bits` bits.  out and
   bits must be lvalues; bits is decremented by six per digit written.
   Wrapped in do/while(0) so the expansion is a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Extract 16-bit code units from the bit accumulator ch for as long as at
   least 16 of its low `bits` bits are pending, storing them to *out
   (advancing out).  A low surrogate (U+DC00..U+DFFF) is not stored:
   instead `surrogate` is set, errmsg is assigned and control jumps to the
   utf7Error label.  Both errmsg and utf7Error must exist at the expansion
   site.  While `surrogate` is set, the next code unit is dropped unchecked
   and the flag is cleared.
   Wrapped in do/while(0) so the expansion is a single statement. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* An error was already reported for the preceding */ \
                /* surrogate, so drop this unit without checking it. */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* Surrogate code unit: surrogate pairs are not */ \
                /* supported by this decoder. */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001593 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 const char *errors)
1595{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001596 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1597}
1598
1599PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1600 Py_ssize_t size,
1601 const char *errors,
1602 Py_ssize_t *consumed)
1603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001605 Py_ssize_t startinpos;
1606 Py_ssize_t endinpos;
1607 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 const char *e;
1609 PyUnicodeObject *unicode;
1610 Py_UNICODE *p;
1611 const char *errmsg = "";
1612 int inShift = 0;
1613 unsigned int bitsleft = 0;
1614 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 int surrogate = 0;
1616 PyObject *errorHandler = NULL;
1617 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618
1619 unicode = _PyUnicode_New(size);
1620 if (!unicode)
1621 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001622 if (size == 0) {
1623 if (consumed)
1624 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001626 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627
1628 p = unicode->str;
1629 e = s + size;
1630
1631 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001632 Py_UNICODE ch;
1633 restart:
1634 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635
1636 if (inShift) {
1637 if ((ch == '-') || !B64CHAR(ch)) {
1638 inShift = 0;
1639 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001640
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1642 if (bitsleft >= 6) {
1643 /* The shift sequence has a partial character in it. If
1644 bitsleft < 6 then we could just classify it as padding
1645 but that is not the case here */
1646
1647 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001648 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 }
1650 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001651 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 here so indicate the potential of a misencoded character. */
1653
1654 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1655 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1656 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001657 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 }
1659
1660 if (ch == '-') {
1661 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001662 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 inShift = 1;
1664 }
1665 } else if (SPECIAL(ch,0,0)) {
1666 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001667 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 } else {
1669 *p++ = ch;
1670 }
1671 } else {
1672 charsleft = (charsleft << 6) | UB64(ch);
1673 bitsleft += 6;
1674 s++;
1675 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1676 }
1677 }
1678 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 s++;
1681 if (s < e && *s == '-') {
1682 s++;
1683 *p++ = '+';
1684 } else
1685 {
1686 inShift = 1;
1687 bitsleft = 0;
1688 }
1689 }
1690 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001691 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 errmsg = "unexpected special character";
1693 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001694 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 }
1696 else {
1697 *p++ = ch;
1698 s++;
1699 }
1700 continue;
1701 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 outpos = p-PyUnicode_AS_UNICODE(unicode);
1703 endinpos = s-starts;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001707 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001708 (PyObject **)&unicode, &outpos, &p))
1709 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001710 }
1711
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001712 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 outpos = p-PyUnicode_AS_UNICODE(unicode);
1714 endinpos = size;
1715 if (unicode_decode_call_errorhandler(
1716 errors, &errorHandler,
1717 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001718 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001720 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 if (s < e)
1722 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001724 if (consumed) {
1725 if(inShift)
1726 *consumed = startinpos;
1727 else
1728 *consumed = s-starts;
1729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001731 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 goto onError;
1733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 Py_XDECREF(errorHandler);
1735 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001736 return (PyObject *)unicode;
1737
1738onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 Py_XDECREF(errorHandler);
1740 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001741 Py_DECREF(unicode);
1742 return NULL;
1743}
1744
1745
1746PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001747 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 int encodeSetO,
1749 int encodeWhiteSpace,
1750 const char *errors)
1751{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001752 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001754 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001756 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 unsigned int bitsleft = 0;
1758 unsigned long charsleft = 0;
1759 char * out;
1760 char * start;
1761
1762 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001763 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764
Christian Heimes9c4756e2008-05-26 13:22:05 +00001765 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 if (v == NULL)
1767 return NULL;
1768
Christian Heimes9c4756e2008-05-26 13:22:05 +00001769 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001770 for (;i < size; ++i) {
1771 Py_UNICODE ch = s[i];
1772
1773 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001774 if (ch == '+') {
1775 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 *out++ = '-';
1777 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1778 charsleft = ch;
1779 bitsleft = 16;
1780 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001781 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001783 } else {
1784 *out++ = (char) ch;
1785 }
1786 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1788 *out++ = B64(charsleft << (6-bitsleft));
1789 charsleft = 0;
1790 bitsleft = 0;
1791 /* Characters not in the BASE64 set implicitly unshift the sequence
1792 so no '-' is required, except if the character is itself a '-' */
1793 if (B64CHAR(ch) || ch == '-') {
1794 *out++ = '-';
1795 }
1796 inShift = 0;
1797 *out++ = (char) ch;
1798 } else {
1799 bitsleft += 16;
1800 charsleft = (charsleft << 16) | ch;
1801 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1802
1803 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001804 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001805 or '-' then the shift sequence will be terminated implicitly and we
1806 don't have to insert a '-'. */
1807
1808 if (bitsleft == 0) {
1809 if (i + 1 < size) {
1810 Py_UNICODE ch2 = s[i+1];
1811
1812 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001813
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001814 } else if (B64CHAR(ch2) || ch2 == '-') {
1815 *out++ = '-';
1816 inShift = 0;
1817 } else {
1818 inShift = 0;
1819 }
1820
1821 }
1822 else {
1823 *out++ = '-';
1824 inShift = 0;
1825 }
1826 }
Tim Petersced69f82003-09-16 20:30:58 +00001827 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001829 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 if (bitsleft) {
1831 *out++= B64(charsleft << (6-bitsleft) );
1832 *out++ = '-';
1833 }
1834
Christian Heimes72b710a2008-05-26 13:28:38 +00001835 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001836 Py_DECREF(v);
1837 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838}
1839
1840#undef SPECIAL
1841#undef B64
1842#undef B64CHAR
1843#undef UB64
1844#undef ENCODE
1845#undef DECODE
1846
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847/* --- UTF-8 Codec -------------------------------------------------------- */
1848
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.

   The table is never modified, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1870
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001872 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 const char *errors)
1874{
Walter Dörwald69652032004-09-07 20:24:22 +00001875 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1876}
1877
1878PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001879 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001880 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001881 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001885 Py_ssize_t startinpos;
1886 Py_ssize_t endinpos;
1887 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 const char *e;
1889 PyUnicodeObject *unicode;
1890 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001891 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 PyObject *errorHandler = NULL;
1893 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
1895 /* Note: size will always be longer than the resulting Unicode
1896 character count */
1897 unicode = _PyUnicode_New(size);
1898 if (!unicode)
1899 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001900 if (size == 0) {
1901 if (consumed)
1902 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905
1906 /* Unpack UTF-8 encoded data */
1907 p = unicode->str;
1908 e = s + size;
1909
1910 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912
1913 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001914 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 s++;
1916 continue;
1917 }
1918
1919 n = utf8_code_length[ch];
1920
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001921 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001922 if (consumed)
1923 break;
1924 else {
1925 errmsg = "unexpected end of data";
1926 startinpos = s-starts;
1927 endinpos = size;
1928 goto utf8Error;
1929 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931
1932 switch (n) {
1933
1934 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001935 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 startinpos = s-starts;
1937 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001938 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939
1940 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001941 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001942 startinpos = s-starts;
1943 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001944 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001947 if ((s[1] & 0xc0) != 0x80) {
1948 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001949 startinpos = s-starts;
1950 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001951 goto utf8Error;
1952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001954 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 startinpos = s-starts;
1956 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001957 errmsg = "illegal encoding";
1958 goto utf8Error;
1959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001961 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 break;
1963
1964 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001965 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001966 (s[2] & 0xc0) != 0x80) {
1967 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 startinpos = s-starts;
1969 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001970 goto utf8Error;
1971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001973 if (ch < 0x0800) {
1974 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001975 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001976
1977 XXX For wide builds (UCS-4) we should probably try
1978 to recombine the surrogates into a single code
1979 unit.
1980 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001981 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982 startinpos = s-starts;
1983 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001984 goto utf8Error;
1985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001987 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 break;
1989
1990 case 4:
1991 if ((s[1] & 0xc0) != 0x80 ||
1992 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001993 (s[3] & 0xc0) != 0x80) {
1994 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 startinpos = s-starts;
1996 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001997 goto utf8Error;
1998 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001999 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2000 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2001 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002002 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002003 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002005 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002007 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 startinpos = s-starts;
2009 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002010 goto utf8Error;
2011 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002012#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002013 *p++ = (Py_UNICODE)ch;
2014#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002015 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002016
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 /* translate from 10000..10FFFF to 0..FFFF */
2018 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002019
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002020 /* high surrogate = top 10 bits added to D800 */
2021 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002022
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002024 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 break;
2027
2028 default:
2029 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002030 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 startinpos = s-starts;
2032 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 }
2035 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002036 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002038 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 outpos = p-PyUnicode_AS_UNICODE(unicode);
2040 if (unicode_decode_call_errorhandler(
2041 errors, &errorHandler,
2042 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002043 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 (PyObject **)&unicode, &outpos, &p))
2045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 }
Walter Dörwald69652032004-09-07 20:24:22 +00002047 if (consumed)
2048 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
2050 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002051 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 goto onError;
2053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)unicode;
2057
2058onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 Py_XDECREF(errorHandler);
2060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 Py_DECREF(unicode);
2062 return NULL;
2063}
2064
Tim Peters602f7402002-04-27 18:03:26 +00002065/* Allocation strategy: if the string is short, convert into a stack buffer
2066 and allocate exactly as much space needed at the end. Else allocate the
2067 maximum possible needed (4 result bytes per Unicode character), and return
2068 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002069*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002070PyObject *
2071PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002072 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002073 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074{
Tim Peters602f7402002-04-27 18:03:26 +00002075#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002076
Guido van Rossum98297ee2007-11-06 21:34:58 +00002077 Py_ssize_t i; /* index into s of next input byte */
2078 PyObject *result; /* result string object */
2079 char *p; /* next free byte in output buffer */
2080 Py_ssize_t nallocated; /* number of result bytes allocated */
2081 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002082 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002083
Tim Peters602f7402002-04-27 18:03:26 +00002084 assert(s != NULL);
2085 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086
Tim Peters602f7402002-04-27 18:03:26 +00002087 if (size <= MAX_SHORT_UNICHARS) {
2088 /* Write into the stack buffer; nallocated can't overflow.
2089 * At the end, we'll allocate exactly as much heap space as it
2090 * turns out we need.
2091 */
2092 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002093 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002094 p = stackbuf;
2095 }
2096 else {
2097 /* Overallocate on the heap, and give the excess back at the end. */
2098 nallocated = size * 4;
2099 if (nallocated / 4 != size) /* overflow! */
2100 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002101 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002102 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002103 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002104 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002105 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002106
Tim Peters602f7402002-04-27 18:03:26 +00002107 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002108 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002109
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002110 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002111 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002113
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002115 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002116 *p++ = (char)(0xc0 | (ch >> 6));
2117 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002118 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002119 else {
Tim Peters602f7402002-04-27 18:03:26 +00002120 /* Encode UCS2 Unicode ordinals */
2121 if (ch < 0x10000) {
2122 /* Special case: check for high surrogate */
2123 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2124 Py_UCS4 ch2 = s[i];
2125 /* Check for low surrogate and combine the two to
2126 form a UCS4 value */
2127 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002128 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002129 i++;
2130 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002131 }
Tim Peters602f7402002-04-27 18:03:26 +00002132 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002134 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002135 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2136 *p++ = (char)(0x80 | (ch & 0x3f));
2137 continue;
2138 }
2139encodeUCS4:
2140 /* Encode UCS4 Unicode ordinals */
2141 *p++ = (char)(0xf0 | (ch >> 18));
2142 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002147
Guido van Rossum98297ee2007-11-06 21:34:58 +00002148 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002149 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002150 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002151 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002152 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002153 }
2154 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002155 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002156 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002157 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002158 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002159 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002160 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002161
Tim Peters602f7402002-04-27 18:03:26 +00002162#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163}
2164
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2166{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 if (!PyUnicode_Check(unicode)) {
2168 PyErr_BadArgument();
2169 return NULL;
2170 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002171 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2172 PyUnicode_GET_SIZE(unicode),
2173 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174}
2175
Walter Dörwald41980ca2007-08-16 21:55:45 +00002176/* --- UTF-32 Codec ------------------------------------------------------- */
2177
2178PyObject *
2179PyUnicode_DecodeUTF32(const char *s,
2180 Py_ssize_t size,
2181 const char *errors,
2182 int *byteorder)
2183{
2184 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2185}
2186
2187PyObject *
2188PyUnicode_DecodeUTF32Stateful(const char *s,
2189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder,
2192 Py_ssize_t *consumed)
2193{
2194 const char *starts = s;
2195 Py_ssize_t startinpos;
2196 Py_ssize_t endinpos;
2197 Py_ssize_t outpos;
2198 PyUnicodeObject *unicode;
2199 Py_UNICODE *p;
2200#ifndef Py_UNICODE_WIDE
2201 int i, pairs;
2202#else
2203 const int pairs = 0;
2204#endif
2205 const unsigned char *q, *e;
2206 int bo = 0; /* assume native ordering by default */
2207 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002208 /* Offsets from q for retrieving bytes in the right order. */
2209#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2210 int iorder[] = {0, 1, 2, 3};
2211#else
2212 int iorder[] = {3, 2, 1, 0};
2213#endif
2214 PyObject *errorHandler = NULL;
2215 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002216 /* On narrow builds we split characters outside the BMP into two
2217 codepoints => count how much extra space we need. */
2218#ifndef Py_UNICODE_WIDE
2219 for (i = pairs = 0; i < size/4; i++)
2220 if (((Py_UCS4 *)s)[i] >= 0x10000)
2221 pairs++;
2222#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002223
2224 /* This might be one to much, because of a BOM */
2225 unicode = _PyUnicode_New((size+3)/4+pairs);
2226 if (!unicode)
2227 return NULL;
2228 if (size == 0)
2229 return (PyObject *)unicode;
2230
2231 /* Unpack UTF-32 encoded data */
2232 p = unicode->str;
2233 q = (unsigned char *)s;
2234 e = q + size;
2235
2236 if (byteorder)
2237 bo = *byteorder;
2238
2239 /* Check for BOM marks (U+FEFF) in the input and adjust current
2240 byte order setting accordingly. In native mode, the leading BOM
2241 mark is skipped, in all other modes, it is copied to the output
2242 stream as-is (giving a ZWNBSP character). */
2243 if (bo == 0) {
2244 if (size >= 4) {
2245 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2246 (q[iorder[1]] << 8) | q[iorder[0]];
2247#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2248 if (bom == 0x0000FEFF) {
2249 q += 4;
2250 bo = -1;
2251 }
2252 else if (bom == 0xFFFE0000) {
2253 q += 4;
2254 bo = 1;
2255 }
2256#else
2257 if (bom == 0x0000FEFF) {
2258 q += 4;
2259 bo = 1;
2260 }
2261 else if (bom == 0xFFFE0000) {
2262 q += 4;
2263 bo = -1;
2264 }
2265#endif
2266 }
2267 }
2268
2269 if (bo == -1) {
2270 /* force LE */
2271 iorder[0] = 0;
2272 iorder[1] = 1;
2273 iorder[2] = 2;
2274 iorder[3] = 3;
2275 }
2276 else if (bo == 1) {
2277 /* force BE */
2278 iorder[0] = 3;
2279 iorder[1] = 2;
2280 iorder[2] = 1;
2281 iorder[3] = 0;
2282 }
2283
2284 while (q < e) {
2285 Py_UCS4 ch;
2286 /* remaining bytes at the end? (size should be divisible by 4) */
2287 if (e-q<4) {
2288 if (consumed)
2289 break;
2290 errmsg = "truncated data";
2291 startinpos = ((const char *)q)-starts;
2292 endinpos = ((const char *)e)-starts;
2293 goto utf32Error;
2294 /* The remaining input chars are ignored if the callback
2295 chooses to skip the input */
2296 }
2297 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2298 (q[iorder[1]] << 8) | q[iorder[0]];
2299
2300 if (ch >= 0x110000)
2301 {
2302 errmsg = "codepoint not in range(0x110000)";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = startinpos+4;
2305 goto utf32Error;
2306 }
2307#ifndef Py_UNICODE_WIDE
2308 if (ch >= 0x10000)
2309 {
2310 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2311 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2312 }
2313 else
2314#endif
2315 *p++ = ch;
2316 q += 4;
2317 continue;
2318 utf32Error:
2319 outpos = p-PyUnicode_AS_UNICODE(unicode);
2320 if (unicode_decode_call_errorhandler(
2321 errors, &errorHandler,
2322 "utf32", errmsg,
2323 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2324 (PyObject **)&unicode, &outpos, &p))
2325 goto onError;
2326 }
2327
2328 if (byteorder)
2329 *byteorder = bo;
2330
2331 if (consumed)
2332 *consumed = (const char *)q-starts;
2333
2334 /* Adjust length */
2335 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2336 goto onError;
2337
2338 Py_XDECREF(errorHandler);
2339 Py_XDECREF(exc);
2340 return (PyObject *)unicode;
2341
2342onError:
2343 Py_DECREF(unicode);
2344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
2346 return NULL;
2347}
2348
2349PyObject *
2350PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2351 Py_ssize_t size,
2352 const char *errors,
2353 int byteorder)
2354{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002355 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002356 unsigned char *p;
2357#ifndef Py_UNICODE_WIDE
2358 int i, pairs;
2359#else
2360 const int pairs = 0;
2361#endif
2362 /* Offsets from p for storing byte pairs in the right order. */
2363#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2364 int iorder[] = {0, 1, 2, 3};
2365#else
2366 int iorder[] = {3, 2, 1, 0};
2367#endif
2368
2369#define STORECHAR(CH) \
2370 do { \
2371 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2372 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2373 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2374 p[iorder[0]] = (CH) & 0xff; \
2375 p += 4; \
2376 } while(0)
2377
2378 /* In narrow builds we can output surrogate pairs as one codepoint,
2379 so we need less space. */
2380#ifndef Py_UNICODE_WIDE
2381 for (i = pairs = 0; i < size-1; i++)
2382 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2383 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2384 pairs++;
2385#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002386 v = PyByteArray_FromStringAndSize(NULL,
Walter Dörwald41980ca2007-08-16 21:55:45 +00002387 4 * (size - pairs + (byteorder == 0)));
2388 if (v == NULL)
2389 return NULL;
2390
Christian Heimes9c4756e2008-05-26 13:22:05 +00002391 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002392 if (byteorder == 0)
2393 STORECHAR(0xFEFF);
2394 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002395 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002396
2397 if (byteorder == -1) {
2398 /* force LE */
2399 iorder[0] = 0;
2400 iorder[1] = 1;
2401 iorder[2] = 2;
2402 iorder[3] = 3;
2403 }
2404 else if (byteorder == 1) {
2405 /* force BE */
2406 iorder[0] = 3;
2407 iorder[1] = 2;
2408 iorder[2] = 1;
2409 iorder[3] = 0;
2410 }
2411
2412 while (size-- > 0) {
2413 Py_UCS4 ch = *s++;
2414#ifndef Py_UNICODE_WIDE
2415 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2416 Py_UCS4 ch2 = *s;
2417 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2418 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2419 s++;
2420 size--;
2421 }
2422 }
2423#endif
2424 STORECHAR(ch);
2425 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002426
2427 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002428 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002429 Py_DECREF(v);
2430 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002431#undef STORECHAR
2432}
2433
2434PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2435{
2436 if (!PyUnicode_Check(unicode)) {
2437 PyErr_BadArgument();
2438 return NULL;
2439 }
2440 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2441 PyUnicode_GET_SIZE(unicode),
2442 NULL,
2443 0);
2444}
2445
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446/* --- UTF-16 Codec ------------------------------------------------------- */
2447
Tim Peters772747b2001-08-09 22:21:55 +00002448PyObject *
2449PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002450 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002451 const char *errors,
2452 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453{
Walter Dörwald69652032004-09-07 20:24:22 +00002454 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2455}
2456
2457PyObject *
2458PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002460 const char *errors,
2461 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002462 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002465 Py_ssize_t startinpos;
2466 Py_ssize_t endinpos;
2467 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 PyUnicodeObject *unicode;
2469 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002470 const unsigned char *q, *e;
2471 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002472 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002473 /* Offsets from q for retrieving byte pairs in the right order. */
2474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2475 int ihi = 1, ilo = 0;
2476#else
2477 int ihi = 0, ilo = 1;
2478#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 PyObject *errorHandler = NULL;
2480 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481
2482 /* Note: size will always be longer than the resulting Unicode
2483 character count */
2484 unicode = _PyUnicode_New(size);
2485 if (!unicode)
2486 return NULL;
2487 if (size == 0)
2488 return (PyObject *)unicode;
2489
2490 /* Unpack UTF-16 encoded data */
2491 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002492 q = (unsigned char *)s;
2493 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494
2495 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002496 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002498 /* Check for BOM marks (U+FEFF) in the input and adjust current
2499 byte order setting accordingly. In native mode, the leading BOM
2500 mark is skipped, in all other modes, it is copied to the output
2501 stream as-is (giving a ZWNBSP character). */
2502 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002503 if (size >= 2) {
2504 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002505#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002506 if (bom == 0xFEFF) {
2507 q += 2;
2508 bo = -1;
2509 }
2510 else if (bom == 0xFFFE) {
2511 q += 2;
2512 bo = 1;
2513 }
Tim Petersced69f82003-09-16 20:30:58 +00002514#else
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (bom == 0xFEFF) {
2516 q += 2;
2517 bo = 1;
2518 }
2519 else if (bom == 0xFFFE) {
2520 q += 2;
2521 bo = -1;
2522 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002523#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
Tim Peters772747b2001-08-09 22:21:55 +00002527 if (bo == -1) {
2528 /* force LE */
2529 ihi = 1;
2530 ilo = 0;
2531 }
2532 else if (bo == 1) {
2533 /* force BE */
2534 ihi = 0;
2535 ilo = 1;
2536 }
2537
2538 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002540 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002542 if (consumed)
2543 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 errmsg = "truncated data";
2545 startinpos = ((const char *)q)-starts;
2546 endinpos = ((const char *)e)-starts;
2547 goto utf16Error;
2548 /* The remaining input chars are ignored if the callback
2549 chooses to skip the input */
2550 }
2551 ch = (q[ihi] << 8) | q[ilo];
2552
Tim Peters772747b2001-08-09 22:21:55 +00002553 q += 2;
2554
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 if (ch < 0xD800 || ch > 0xDFFF) {
2556 *p++ = ch;
2557 continue;
2558 }
2559
2560 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002561 if (q >= e) {
2562 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 startinpos = (((const char *)q)-2)-starts;
2564 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002565 goto utf16Error;
2566 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002567 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002568 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2569 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002570 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002571#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002572 *p++ = ch;
2573 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574#else
2575 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002577 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578 }
2579 else {
2580 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 startinpos = (((const char *)q)-4)-starts;
2582 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002583 goto utf16Error;
2584 }
2585
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002587 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 startinpos = (((const char *)q)-2)-starts;
2589 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002590 /* Fall through to report the error */
2591
2592 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 outpos = p-PyUnicode_AS_UNICODE(unicode);
2594 if (unicode_decode_call_errorhandler(
2595 errors, &errorHandler,
2596 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002597 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002598 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002599 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 }
2601
2602 if (byteorder)
2603 *byteorder = bo;
2604
Walter Dörwald69652032004-09-07 20:24:22 +00002605 if (consumed)
2606 *consumed = (const char *)q-starts;
2607
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002609 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 goto onError;
2611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 Py_XDECREF(errorHandler);
2613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 return (PyObject *)unicode;
2615
2616onError:
2617 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 Py_XDECREF(errorHandler);
2619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 return NULL;
2621}
2622
Tim Peters772747b2001-08-09 22:21:55 +00002623PyObject *
2624PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002625 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002626 const char *errors,
2627 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002629 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002630 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002631#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002632 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002633#else
2634 const int pairs = 0;
2635#endif
Tim Peters772747b2001-08-09 22:21:55 +00002636 /* Offsets from p for storing byte pairs in the right order. */
2637#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2638 int ihi = 1, ilo = 0;
2639#else
2640 int ihi = 0, ilo = 1;
2641#endif
2642
2643#define STORECHAR(CH) \
2644 do { \
2645 p[ihi] = ((CH) >> 8) & 0xff; \
2646 p[ilo] = (CH) & 0xff; \
2647 p += 2; \
2648 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002651 for (i = pairs = 0; i < size; i++)
2652 if (s[i] >= 0x10000)
2653 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002654#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002655 v = PyByteArray_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002656 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 if (v == NULL)
2658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659
Christian Heimes9c4756e2008-05-26 13:22:05 +00002660 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002662 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002663 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002664 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002665
2666 if (byteorder == -1) {
2667 /* force LE */
2668 ihi = 1;
2669 ilo = 0;
2670 }
2671 else if (byteorder == 1) {
2672 /* force BE */
2673 ihi = 0;
2674 ilo = 1;
2675 }
2676
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002677 while (size-- > 0) {
2678 Py_UNICODE ch = *s++;
2679 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002680#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002681 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002682 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2683 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002685#endif
Tim Peters772747b2001-08-09 22:21:55 +00002686 STORECHAR(ch);
2687 if (ch2)
2688 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002689 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002690
2691 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002692 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002693 Py_DECREF(v);
2694 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002695#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696}
2697
2698PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2699{
2700 if (!PyUnicode_Check(unicode)) {
2701 PyErr_BadArgument();
2702 return NULL;
2703 }
2704 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2705 PyUnicode_GET_SIZE(unicode),
2706 NULL,
2707 0);
2708}
2709
2710/* --- Unicode Escape Codec ----------------------------------------------- */
2711
Fredrik Lundh06d12682001-01-24 07:59:11 +00002712static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002713
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002715 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 const char *errors)
2717{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002719 Py_ssize_t startinpos;
2720 Py_ssize_t endinpos;
2721 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002726 char* message;
2727 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 PyObject *errorHandler = NULL;
2729 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 /* Escaped strings will always be longer than the resulting
2732 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 length after conversion to the true value.
2734 (but if the error callback returns a long replacement string
2735 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 v = _PyUnicode_New(size);
2737 if (v == NULL)
2738 goto onError;
2739 if (size == 0)
2740 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 while (s < end) {
2746 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002747 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749
2750 /* Non-escape characters are interpreted as Unicode ordinals */
2751 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002752 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 continue;
2754 }
2755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 /* \ - Escapes */
2758 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002759 c = *s++;
2760 if (s > end)
2761 c = '\0'; /* Invalid after \ */
2762 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763
2764 /* \x escapes */
2765 case '\n': break;
2766 case '\\': *p++ = '\\'; break;
2767 case '\'': *p++ = '\''; break;
2768 case '\"': *p++ = '\"'; break;
2769 case 'b': *p++ = '\b'; break;
2770 case 'f': *p++ = '\014'; break; /* FF */
2771 case 't': *p++ = '\t'; break;
2772 case 'n': *p++ = '\n'; break;
2773 case 'r': *p++ = '\r'; break;
2774 case 'v': *p++ = '\013'; break; /* VT */
2775 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2776
2777 /* \OOO (octal) escapes */
2778 case '0': case '1': case '2': case '3':
2779 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002780 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002781 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002782 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002783 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002784 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002786 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 break;
2788
Fredrik Lundhccc74732001-02-18 22:13:49 +00002789 /* hex escapes */
2790 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 digits = 2;
2793 message = "truncated \\xXX escape";
2794 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 digits = 4;
2799 message = "truncated \\uXXXX escape";
2800 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002803 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002804 digits = 8;
2805 message = "truncated \\UXXXXXXXX escape";
2806 hexescape:
2807 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 outpos = p-PyUnicode_AS_UNICODE(v);
2809 if (s+digits>end) {
2810 endinpos = size;
2811 if (unicode_decode_call_errorhandler(
2812 errors, &errorHandler,
2813 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002814 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 (PyObject **)&v, &outpos, &p))
2816 goto onError;
2817 goto nextByte;
2818 }
2819 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002821 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 endinpos = (s+i+1)-starts;
2823 if (unicode_decode_call_errorhandler(
2824 errors, &errorHandler,
2825 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002826 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002828 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002830 }
2831 chr = (chr<<4) & ~0xF;
2832 if (c >= '0' && c <= '9')
2833 chr += c - '0';
2834 else if (c >= 'a' && c <= 'f')
2835 chr += 10 + c - 'a';
2836 else
2837 chr += 10 + c - 'A';
2838 }
2839 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002840 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 /* _decoding_error will have already written into the
2842 target buffer. */
2843 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002844 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002845 /* when we get here, chr is a 32-bit unicode character */
2846 if (chr <= 0xffff)
2847 /* UCS-2 character */
2848 *p++ = (Py_UNICODE) chr;
2849 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002850 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002851 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002852#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002853 *p++ = chr;
2854#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002855 chr -= 0x10000L;
2856 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002857 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002858#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002859 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 endinpos = s-starts;
2861 outpos = p-PyUnicode_AS_UNICODE(v);
2862 if (unicode_decode_call_errorhandler(
2863 errors, &errorHandler,
2864 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002865 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002867 goto onError;
2868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002869 break;
2870
2871 /* \N{name} */
2872 case 'N':
2873 message = "malformed \\N character escape";
2874 if (ucnhash_CAPI == NULL) {
2875 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002876 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002877 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 if (m == NULL)
2879 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002880 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002881 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002882 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002884 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002885 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 if (ucnhash_CAPI == NULL)
2887 goto ucnhashError;
2888 }
2889 if (*s == '{') {
2890 const char *start = s+1;
2891 /* look for the closing brace */
2892 while (*s != '}' && s < end)
2893 s++;
2894 if (s > start && s < end && *s == '}') {
2895 /* found a name. look it up in the unicode database */
2896 message = "unknown Unicode character name";
2897 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002898 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002899 goto store;
2900 }
2901 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002902 endinpos = s-starts;
2903 outpos = p-PyUnicode_AS_UNICODE(v);
2904 if (unicode_decode_call_errorhandler(
2905 errors, &errorHandler,
2906 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002907 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002909 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 break;
2911
2912 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002913 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914 message = "\\ at end of string";
2915 s--;
2916 endinpos = s-starts;
2917 outpos = p-PyUnicode_AS_UNICODE(v);
2918 if (unicode_decode_call_errorhandler(
2919 errors, &errorHandler,
2920 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002921 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002923 goto onError;
2924 }
2925 else {
2926 *p++ = '\\';
2927 *p++ = (unsigned char)s[-1];
2928 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002929 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 nextByte:
2932 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002934 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002936 Py_XDECREF(errorHandler);
2937 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002939
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002941 PyErr_SetString(
2942 PyExc_UnicodeError,
2943 "\\N escapes not supported (can't load unicodedata module)"
2944 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002945 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 Py_XDECREF(errorHandler);
2947 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002948 return NULL;
2949
Fredrik Lundhccc74732001-02-18 22:13:49 +00002950onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 Py_XDECREF(errorHandler);
2953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 return NULL;
2955}
2956
2957/* Return a Unicode-Escape string version of the Unicode object.
2958
2959 If quotes is true, the string is enclosed in u"" or u'' quotes as
2960 appropriate.
2961
2962*/
2963
Thomas Wouters477c8d52006-05-27 19:21:47 +00002964Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2965 Py_ssize_t size,
2966 Py_UNICODE ch)
2967{
2968 /* like wcschr, but doesn't stop at NULL characters */
2969
2970 while (size-- > 0) {
2971 if (*s == ch)
2972 return s;
2973 s++;
2974 }
2975
2976 return NULL;
2977}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002978
/* Lowercase hexadecimal digit characters, indexed by nibble value (0-15);
   used by the escape encoder below to format the digits of \U escapes. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002979static const char *hexdigits = "0123456789abcdef";
2980
2981PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002984 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986
Thomas Wouters89f507f2006-12-13 04:49:30 +00002987 /* XXX(nnorwitz): rather than over-allocating, it would be
2988 better to choose a different scheme. Perhaps scan the
2989 first N-chars of the string and allocate based on that size.
2990 */
2991 /* Initial allocation is based on the longest-possible unichr
2992 escape.
2993
2994 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2995 unichr, so in this case it's the longest unichr escape. In
2996 narrow (UTF-16) builds this is five chars per source unichr
2997 since there are two unichrs in the surrogate pair, so in narrow
2998 (UTF-16) builds it's not the longest unichr escape.
2999
3000 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3001 so in the narrow (UTF-16) build case it's the longest unichr
3002 escape.
3003 */
3004
Christian Heimes9c4756e2008-05-26 13:22:05 +00003005 repr = PyByteArray_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003006#ifdef Py_UNICODE_WIDE
3007 + 10*size
3008#else
3009 + 6*size
3010#endif
3011 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 if (repr == NULL)
3013 return NULL;
3014
Christian Heimes9c4756e2008-05-26 13:22:05 +00003015 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 while (size-- > 0) {
3018 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003019
Walter Dörwald79e913e2007-05-12 11:08:06 +00003020 /* Escape backslashes */
3021 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 *p++ = '\\';
3023 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003024 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003025 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003026
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003027#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003028 /* Map 21-bit characters to '\U00xxxxxx' */
3029 else if (ch >= 0x10000) {
3030 *p++ = '\\';
3031 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003032 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3033 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3034 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3035 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3036 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3037 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3038 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3039 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003040 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003041 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003042#else
3043 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003044 else if (ch >= 0xD800 && ch < 0xDC00) {
3045 Py_UNICODE ch2;
3046 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003047
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003048 ch2 = *s++;
3049 size--;
3050 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3051 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3052 *p++ = '\\';
3053 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003054 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3055 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3056 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3057 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3058 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3059 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3060 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3061 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003062 continue;
3063 }
3064 /* Fall through: isolated surrogates are copied as-is */
3065 s--;
3066 size++;
3067 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003068#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003071 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 *p++ = '\\';
3073 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003074 *p++ = hexdigits[(ch >> 12) & 0x000F];
3075 *p++ = hexdigits[(ch >> 8) & 0x000F];
3076 *p++ = hexdigits[(ch >> 4) & 0x000F];
3077 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003079
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003080 /* Map special whitespace to '\t', \n', '\r' */
3081 else if (ch == '\t') {
3082 *p++ = '\\';
3083 *p++ = 't';
3084 }
3085 else if (ch == '\n') {
3086 *p++ = '\\';
3087 *p++ = 'n';
3088 }
3089 else if (ch == '\r') {
3090 *p++ = '\\';
3091 *p++ = 'r';
3092 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003093
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003094 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003095 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003097 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003098 *p++ = hexdigits[(ch >> 4) & 0x000F];
3099 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003101
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 /* Copy everything else as-is */
3103 else
3104 *p++ = (char) ch;
3105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106
Christian Heimes72b710a2008-05-26 13:28:38 +00003107 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003108 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003109 Py_DECREF(repr);
3110 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111}
3112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3114{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003115 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 if (!PyUnicode_Check(unicode)) {
3117 PyErr_BadArgument();
3118 return NULL;
3119 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003120 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3121 PyUnicode_GET_SIZE(unicode));
3122
3123 if (!s)
3124 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003125 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003126 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003127 Py_DECREF(s);
3128 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129}
3130
3131/* --- Raw Unicode Escape Codec ------------------------------------------- */
3132
3133PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003134 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 const char *errors)
3136{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003137 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003138 Py_ssize_t startinpos;
3139 Py_ssize_t endinpos;
3140 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 const char *end;
3144 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003145 PyObject *errorHandler = NULL;
3146 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 /* Escaped strings will always be longer than the resulting
3149 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 length after conversion to the true value. (But decoding error
3151 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 v = _PyUnicode_New(size);
3153 if (v == NULL)
3154 goto onError;
3155 if (size == 0)
3156 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 end = s + size;
3159 while (s < end) {
3160 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003161 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003163 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
3165 /* Non-escape characters are interpreted as Unicode ordinals */
3166 if (*s != '\\') {
3167 *p++ = (unsigned char)*s++;
3168 continue;
3169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
3172 /* \u-escapes are only interpreted iff the number of leading
3173 backslashes if odd */
3174 bs = s;
3175 for (;s < end;) {
3176 if (*s != '\\')
3177 break;
3178 *p++ = (unsigned char)*s++;
3179 }
3180 if (((s - bs) & 1) == 0 ||
3181 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003182 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 continue;
3184 }
3185 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003186 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 s++;
3188
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003189 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003191 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003193 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 endinpos = s-starts;
3195 if (unicode_decode_call_errorhandler(
3196 errors, &errorHandler,
3197 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003198 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 }
3203 x = (x<<4) & ~0xF;
3204 if (c >= '0' && c <= '9')
3205 x += c - '0';
3206 else if (c >= 'a' && c <= 'f')
3207 x += 10 + c - 'a';
3208 else
3209 x += 10 + c - 'A';
3210 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003211 if (x <= 0xffff)
3212 /* UCS-2 character */
3213 *p++ = (Py_UNICODE) x;
3214 else if (x <= 0x10ffff) {
3215 /* UCS-4 character. Either store directly, or as
3216 surrogate pair. */
3217#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003218 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003219#else
3220 x -= 0x10000L;
3221 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3222 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3223#endif
3224 } else {
3225 endinpos = s-starts;
3226 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003227 if (unicode_decode_call_errorhandler(
3228 errors, &errorHandler,
3229 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003230 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003231 (PyObject **)&v, &outpos, &p))
3232 goto onError;
3233 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 nextByte:
3235 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003237 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003238 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 Py_XDECREF(errorHandler);
3240 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 onError:
3244 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 Py_XDECREF(errorHandler);
3246 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return NULL;
3248}
3249
3250PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003251 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003253 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 char *p;
3255 char *q;
3256
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257#ifdef Py_UNICODE_WIDE
Christian Heimes9c4756e2008-05-26 13:22:05 +00003258 repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#else
Christian Heimes9c4756e2008-05-26 13:22:05 +00003260 repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 if (repr == NULL)
3263 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003264 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003265 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266
Christian Heimes9c4756e2008-05-26 13:22:05 +00003267 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 while (size-- > 0) {
3269 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003270#ifdef Py_UNICODE_WIDE
3271 /* Map 32-bit characters to '\Uxxxxxxxx' */
3272 if (ch >= 0x10000) {
3273 *p++ = '\\';
3274 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003275 *p++ = hexdigits[(ch >> 28) & 0xf];
3276 *p++ = hexdigits[(ch >> 24) & 0xf];
3277 *p++ = hexdigits[(ch >> 20) & 0xf];
3278 *p++ = hexdigits[(ch >> 16) & 0xf];
3279 *p++ = hexdigits[(ch >> 12) & 0xf];
3280 *p++ = hexdigits[(ch >> 8) & 0xf];
3281 *p++ = hexdigits[(ch >> 4) & 0xf];
3282 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003283 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003285#else
3286 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3287 if (ch >= 0xD800 && ch < 0xDC00) {
3288 Py_UNICODE ch2;
3289 Py_UCS4 ucs;
3290
3291 ch2 = *s++;
3292 size--;
3293 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3294 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3295 *p++ = '\\';
3296 *p++ = 'U';
3297 *p++ = hexdigits[(ucs >> 28) & 0xf];
3298 *p++ = hexdigits[(ucs >> 24) & 0xf];
3299 *p++ = hexdigits[(ucs >> 20) & 0xf];
3300 *p++ = hexdigits[(ucs >> 16) & 0xf];
3301 *p++ = hexdigits[(ucs >> 12) & 0xf];
3302 *p++ = hexdigits[(ucs >> 8) & 0xf];
3303 *p++ = hexdigits[(ucs >> 4) & 0xf];
3304 *p++ = hexdigits[ucs & 0xf];
3305 continue;
3306 }
3307 /* Fall through: isolated surrogates are copied as-is */
3308 s--;
3309 size++;
3310 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003311#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 /* Map 16-bit characters to '\uxxxx' */
3313 if (ch >= 256) {
3314 *p++ = '\\';
3315 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003316 *p++ = hexdigits[(ch >> 12) & 0xf];
3317 *p++ = hexdigits[(ch >> 8) & 0xf];
3318 *p++ = hexdigits[(ch >> 4) & 0xf];
3319 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 }
3321 /* Copy everything else as-is */
3322 else
3323 *p++ = (char) ch;
3324 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003325 size = p - q;
3326
3327 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003328 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003329 Py_DECREF(repr);
3330 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331}
3332
3333PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3334{
Walter Dörwald711005d2007-05-12 12:03:26 +00003335 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003337 PyErr_BadArgument();
3338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003340 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3341 PyUnicode_GET_SIZE(unicode));
3342
3343 if (!s)
3344 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003345 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003346 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003347 Py_DECREF(s);
3348 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349}
3350
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003351/* --- Unicode Internal Codec ------------------------------------------- */
3352
3353PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003354 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003355 const char *errors)
3356{
3357 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003358 Py_ssize_t startinpos;
3359 Py_ssize_t endinpos;
3360 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003361 PyUnicodeObject *v;
3362 Py_UNICODE *p;
3363 const char *end;
3364 const char *reason;
3365 PyObject *errorHandler = NULL;
3366 PyObject *exc = NULL;
3367
Neal Norwitzd43069c2006-01-08 01:12:10 +00003368#ifdef Py_UNICODE_WIDE
3369 Py_UNICODE unimax = PyUnicode_GetMax();
3370#endif
3371
Thomas Wouters89f507f2006-12-13 04:49:30 +00003372 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003373 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3374 if (v == NULL)
3375 goto onError;
3376 if (PyUnicode_GetSize((PyObject *)v) == 0)
3377 return (PyObject *)v;
3378 p = PyUnicode_AS_UNICODE(v);
3379 end = s + size;
3380
3381 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003382 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003383 /* We have to sanity check the raw data, otherwise doom looms for
3384 some malformed UCS-4 data. */
3385 if (
3386 #ifdef Py_UNICODE_WIDE
3387 *p > unimax || *p < 0 ||
3388 #endif
3389 end-s < Py_UNICODE_SIZE
3390 )
3391 {
3392 startinpos = s - starts;
3393 if (end-s < Py_UNICODE_SIZE) {
3394 endinpos = end-starts;
3395 reason = "truncated input";
3396 }
3397 else {
3398 endinpos = s - starts + Py_UNICODE_SIZE;
3399 reason = "illegal code point (> 0x10FFFF)";
3400 }
3401 outpos = p - PyUnicode_AS_UNICODE(v);
3402 if (unicode_decode_call_errorhandler(
3403 errors, &errorHandler,
3404 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003405 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 (PyObject **)&v, &outpos, &p)) {
3407 goto onError;
3408 }
3409 }
3410 else {
3411 p++;
3412 s += Py_UNICODE_SIZE;
3413 }
3414 }
3415
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003416 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003417 goto onError;
3418 Py_XDECREF(errorHandler);
3419 Py_XDECREF(exc);
3420 return (PyObject *)v;
3421
3422 onError:
3423 Py_XDECREF(v);
3424 Py_XDECREF(errorHandler);
3425 Py_XDECREF(exc);
3426 return NULL;
3427}
3428
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429/* --- Latin-1 Codec ------------------------------------------------------ */
3430
3431PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003432 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 const char *errors)
3434{
3435 PyUnicodeObject *v;
3436 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003437
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003439 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003440 Py_UNICODE r = *(unsigned char*)s;
3441 return PyUnicode_FromUnicode(&r, 1);
3442 }
3443
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 v = _PyUnicode_New(size);
3445 if (v == NULL)
3446 goto onError;
3447 if (size == 0)
3448 return (PyObject *)v;
3449 p = PyUnicode_AS_UNICODE(v);
3450 while (size-- > 0)
3451 *p++ = (unsigned char)*s++;
3452 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003453
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 onError:
3455 Py_XDECREF(v);
3456 return NULL;
3457}
3458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459/* create or adjust a UnicodeEncodeError */
3460static void make_encode_exception(PyObject **exceptionObject,
3461 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003462 const Py_UNICODE *unicode, Py_ssize_t size,
3463 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 if (*exceptionObject == NULL) {
3467 *exceptionObject = PyUnicodeEncodeError_Create(
3468 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 }
3470 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3472 goto onError;
3473 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3474 goto onError;
3475 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3476 goto onError;
3477 return;
3478 onError:
3479 Py_DECREF(*exceptionObject);
3480 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 }
3482}
3483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484/* raises a UnicodeEncodeError */
3485static void raise_encode_exception(PyObject **exceptionObject,
3486 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003487 const Py_UNICODE *unicode, Py_ssize_t size,
3488 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 const char *reason)
3490{
3491 make_encode_exception(exceptionObject,
3492 encoding, unicode, size, startpos, endpos, reason);
3493 if (*exceptionObject != NULL)
3494 PyCodec_StrictErrors(*exceptionObject);
3495}
3496
3497/* error handling callback helper:
3498 build arguments, call the callback and check the arguments,
3499 put the result into newpos and return the replacement string, which
3500 has to be freed by the caller */
3501static PyObject *unicode_encode_call_errorhandler(const char *errors,
3502 PyObject **errorHandler,
3503 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003504 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3505 Py_ssize_t startpos, Py_ssize_t endpos,
3506 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509
3510 PyObject *restuple;
3511 PyObject *resunicode;
3512
3513 if (*errorHandler == NULL) {
3514 *errorHandler = PyCodec_LookupError(errors);
3515 if (*errorHandler == NULL)
3516 return NULL;
3517 }
3518
3519 make_encode_exception(exceptionObject,
3520 encoding, unicode, size, startpos, endpos, reason);
3521 if (*exceptionObject == NULL)
3522 return NULL;
3523
3524 restuple = PyObject_CallFunctionObjArgs(
3525 *errorHandler, *exceptionObject, NULL);
3526 if (restuple == NULL)
3527 return NULL;
3528 if (!PyTuple_Check(restuple)) {
3529 PyErr_Format(PyExc_TypeError, &argparse[4]);
3530 Py_DECREF(restuple);
3531 return NULL;
3532 }
3533 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3534 &resunicode, newpos)) {
3535 Py_DECREF(restuple);
3536 return NULL;
3537 }
3538 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003539 *newpos = size+*newpos;
3540 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003541 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003542 Py_DECREF(restuple);
3543 return NULL;
3544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 Py_INCREF(resunicode);
3546 Py_DECREF(restuple);
3547 return resunicode;
3548}
3549
3550static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 const char *errors,
3553 int limit)
3554{
3555 /* output object */
3556 PyObject *res;
3557 /* pointers to the beginning and end+1 of input */
3558 const Py_UNICODE *startp = p;
3559 const Py_UNICODE *endp = p + size;
3560 /* pointer to the beginning of the unencodable characters */
3561 /* const Py_UNICODE *badp = NULL; */
3562 /* pointer into the output */
3563 char *str;
3564 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003566 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3567 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 PyObject *errorHandler = NULL;
3569 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003570 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 /* the following variable is used for caching string comparisons
3572 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3573 int known_errorHandler = -1;
3574
3575 /* allocate enough for a simple encoding without
3576 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003577 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003578 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003579 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003581 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003582 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 ressize = size;
3584
3585 while (p<endp) {
3586 Py_UNICODE c = *p;
3587
3588 /* can we encode this? */
3589 if (c<limit) {
3590 /* no overflow check, because we know that the space is enough */
3591 *str++ = (char)c;
3592 ++p;
3593 }
3594 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t unicodepos = p-startp;
3596 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003598 Py_ssize_t repsize;
3599 Py_ssize_t newpos;
3600 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_UNICODE *uni2;
3602 /* startpos for collecting unencodable chars */
3603 const Py_UNICODE *collstart = p;
3604 const Py_UNICODE *collend = p;
3605 /* find all unecodable characters */
3606 while ((collend < endp) && ((*collend)>=limit))
3607 ++collend;
3608 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3609 if (known_errorHandler==-1) {
3610 if ((errors==NULL) || (!strcmp(errors, "strict")))
3611 known_errorHandler = 1;
3612 else if (!strcmp(errors, "replace"))
3613 known_errorHandler = 2;
3614 else if (!strcmp(errors, "ignore"))
3615 known_errorHandler = 3;
3616 else if (!strcmp(errors, "xmlcharrefreplace"))
3617 known_errorHandler = 4;
3618 else
3619 known_errorHandler = 0;
3620 }
3621 switch (known_errorHandler) {
3622 case 1: /* strict */
3623 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3624 goto onError;
3625 case 2: /* replace */
3626 while (collstart++<collend)
3627 *str++ = '?'; /* fall through */
3628 case 3: /* ignore */
3629 p = collend;
3630 break;
3631 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003632 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 /* determine replacement size (temporarily (mis)uses p) */
3634 for (p = collstart, repsize = 0; p < collend; ++p) {
3635 if (*p<10)
3636 repsize += 2+1+1;
3637 else if (*p<100)
3638 repsize += 2+2+1;
3639 else if (*p<1000)
3640 repsize += 2+3+1;
3641 else if (*p<10000)
3642 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003643#ifndef Py_UNICODE_WIDE
3644 else
3645 repsize += 2+5+1;
3646#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 else if (*p<100000)
3648 repsize += 2+5+1;
3649 else if (*p<1000000)
3650 repsize += 2+6+1;
3651 else
3652 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003653#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 }
3655 requiredsize = respos+repsize+(endp-collend);
3656 if (requiredsize > ressize) {
3657 if (requiredsize<2*ressize)
3658 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003659 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003661 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 ressize = requiredsize;
3663 }
3664 /* generate replacement (temporarily (mis)uses p) */
3665 for (p = collstart; p < collend; ++p) {
3666 str += sprintf(str, "&#%d;", (int)*p);
3667 }
3668 p = collend;
3669 break;
3670 default:
3671 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3672 encoding, reason, startp, size, &exc,
3673 collstart-startp, collend-startp, &newpos);
3674 if (repunicode == NULL)
3675 goto onError;
3676 /* need more space? (at least enough for what we
3677 have+the replacement+the rest of the string, so
3678 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003679 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 repsize = PyUnicode_GET_SIZE(repunicode);
3681 requiredsize = respos+repsize+(endp-collend);
3682 if (requiredsize > ressize) {
3683 if (requiredsize<2*ressize)
3684 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003685 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_DECREF(repunicode);
3687 goto onError;
3688 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003689 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 ressize = requiredsize;
3691 }
3692 /* check if there is anything unencodable in the replacement
3693 and copy it to the output */
3694 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3695 c = *uni2;
3696 if (c >= limit) {
3697 raise_encode_exception(&exc, encoding, startp, size,
3698 unicodepos, unicodepos+1, reason);
3699 Py_DECREF(repunicode);
3700 goto onError;
3701 }
3702 *str = (char)c;
3703 }
3704 p = startp + newpos;
3705 Py_DECREF(repunicode);
3706 }
3707 }
3708 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003709 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003710 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003711 onError:
3712 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 Py_XDECREF(errorHandler);
3714 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003715 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716}
3717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003719 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 const char *errors)
3721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723}
3724
3725PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3726{
3727 if (!PyUnicode_Check(unicode)) {
3728 PyErr_BadArgument();
3729 return NULL;
3730 }
3731 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3732 PyUnicode_GET_SIZE(unicode),
3733 NULL);
3734}
3735
3736/* --- 7-bit ASCII Codec -------------------------------------------------- */
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 const char *errors)
3741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 PyUnicodeObject *v;
3744 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t startinpos;
3746 Py_ssize_t endinpos;
3747 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 const char *e;
3749 PyObject *errorHandler = NULL;
3750 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003753 if (size == 1 && *(unsigned char*)s < 128) {
3754 Py_UNICODE r = *(unsigned char*)s;
3755 return PyUnicode_FromUnicode(&r, 1);
3756 }
Tim Petersced69f82003-09-16 20:30:58 +00003757
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 v = _PyUnicode_New(size);
3759 if (v == NULL)
3760 goto onError;
3761 if (size == 0)
3762 return (PyObject *)v;
3763 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 e = s + size;
3765 while (s < e) {
3766 register unsigned char c = (unsigned char)*s;
3767 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 ++s;
3770 }
3771 else {
3772 startinpos = s-starts;
3773 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003774 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 if (unicode_decode_call_errorhandler(
3776 errors, &errorHandler,
3777 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003778 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003783 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003784 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 Py_XDECREF(errorHandler);
3787 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 onError:
3791 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 Py_XDECREF(errorHandler);
3793 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 return NULL;
3795}
3796
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003798 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 const char *errors)
3800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802}
3803
3804PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3805{
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
3810 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3811 PyUnicode_GET_SIZE(unicode),
3812 NULL);
3813}
3814
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003815#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003816
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003817/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003818
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003819#if SIZEOF_INT < SIZEOF_SSIZE_T
3820#define NEED_RETRY
3821#endif
3822
3823/* XXX This code is limited to "true" double-byte encodings, as
3824 a) it assumes an incomplete character consists of a single byte, and
3825 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3826 encodings, see IsDBCSLeadByteEx documentation. */
3827
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   character preceding it (as located by CharPrev()) is inspected and 1 is
   returned when any of the following holds, 0 when none does:
     - CharPrev() returned the current position itself,
     - the previous character does not start with a lead byte,
     - the previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr)) {
        return 0;
    }

    prev = CharPrev(s, curr);
    if (prev == curr) {
        return 1;
    }
    if (!IsDBCSLeadByte(*prev)) {
        return 1;
    }
    return curr - prev == 2;
}
3838
3839/*
3840 * Decode MBCS string into unicode object. If 'final' is set, converts
3841 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3842 */
3843static int decode_mbcs(PyUnicodeObject **v,
3844 const char *s, /* MBCS string */
3845 int size, /* sizeof MBCS string */
3846 int final)
3847{
3848 Py_UNICODE *p;
3849 Py_ssize_t n = 0;
3850 int usize = 0;
3851
3852 assert(size >= 0);
3853
3854 /* Skip trailing lead-byte unless 'final' is set */
3855 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3856 --size;
3857
3858 /* First get the size of the result */
3859 if (size > 0) {
3860 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3861 if (usize == 0) {
3862 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3863 return -1;
3864 }
3865 }
3866
3867 if (*v == NULL) {
3868 /* Create unicode object */
3869 *v = _PyUnicode_New(usize);
3870 if (*v == NULL)
3871 return -1;
3872 }
3873 else {
3874 /* Extend unicode object */
3875 n = PyUnicode_GET_SIZE(*v);
3876 if (_PyUnicode_Resize(v, n + usize) < 0)
3877 return -1;
3878 }
3879
3880 /* Do the conversion */
3881 if (size > 0) {
3882 p = PyUnicode_AS_UNICODE(*v) + n;
3883 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3884 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3885 return -1;
3886 }
3887 }
3888
3889 return size;
3890}
3891
3892PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3893 Py_ssize_t size,
3894 const char *errors,
3895 Py_ssize_t *consumed)
3896{
3897 PyUnicodeObject *v = NULL;
3898 int done;
3899
3900 if (consumed)
3901 *consumed = 0;
3902
3903#ifdef NEED_RETRY
3904 retry:
3905 if (size > INT_MAX)
3906 done = decode_mbcs(&v, s, INT_MAX, 0);
3907 else
3908#endif
3909 done = decode_mbcs(&v, s, (int)size, !consumed);
3910
3911 if (done < 0) {
3912 Py_XDECREF(v);
3913 return NULL;
3914 }
3915
3916 if (consumed)
3917 *consumed += done;
3918
3919#ifdef NEED_RETRY
3920 if (size > INT_MAX) {
3921 s += done;
3922 size -= done;
3923 goto retry;
3924 }
3925#endif
3926
3927 return (PyObject *)v;
3928}
3929
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003930PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003931 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003932 const char *errors)
3933{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003934 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3935}
3936
3937/*
3938 * Convert unicode into string object (MBCS).
3939 * Returns 0 if succeed, -1 otherwise.
3940 */
3941static int encode_mbcs(PyObject **repr,
3942 const Py_UNICODE *p, /* unicode */
3943 int size) /* size of unicode */
3944{
3945 int mbcssize = 0;
3946 Py_ssize_t n = 0;
3947
3948 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003949
3950 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003951 if (size > 0) {
3952 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3953 if (mbcssize == 0) {
3954 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3955 return -1;
3956 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003957 }
3958
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003959 if (*repr == NULL) {
3960 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00003961 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003962 if (*repr == NULL)
3963 return -1;
3964 }
3965 else {
3966 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00003967 n = PyBytes_Size(*repr);
3968 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003969 return -1;
3970 }
3971
3972 /* Do the conversion */
3973 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00003974 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003975 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3976 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3977 return -1;
3978 }
3979 }
3980
3981 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003982}
3983
3984PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003986 const char *errors)
3987{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003988 PyObject *repr = NULL;
3989 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003990
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003991#ifdef NEED_RETRY
3992 retry:
3993 if (size > INT_MAX)
3994 ret = encode_mbcs(&repr, p, INT_MAX);
3995 else
3996#endif
3997 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003998
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003999 if (ret < 0) {
4000 Py_XDECREF(repr);
4001 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004002 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004003
4004#ifdef NEED_RETRY
4005 if (size > INT_MAX) {
4006 p += INT_MAX;
4007 size -= INT_MAX;
4008 goto retry;
4009 }
4010#endif
4011
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004012 return repr;
4013}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004014
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004015PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4016{
4017 if (!PyUnicode_Check(unicode)) {
4018 PyErr_BadArgument();
4019 return NULL;
4020 }
4021 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4022 PyUnicode_GET_SIZE(unicode),
4023 NULL);
4024}
4025
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004026#undef NEED_RETRY
4027
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004028#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030/* --- Character Mapping Codec -------------------------------------------- */
4031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 PyObject *mapping,
4035 const char *errors)
4036{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004038 Py_ssize_t startinpos;
4039 Py_ssize_t endinpos;
4040 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 PyUnicodeObject *v;
4043 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 PyObject *errorHandler = NULL;
4046 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004047 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004048 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 /* Default to Latin-1 */
4051 if (mapping == NULL)
4052 return PyUnicode_DecodeLatin1(s, size, errors);
4053
4054 v = _PyUnicode_New(size);
4055 if (v == NULL)
4056 goto onError;
4057 if (size == 0)
4058 return (PyObject *)v;
4059 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004061 if (PyUnicode_CheckExact(mapping)) {
4062 mapstring = PyUnicode_AS_UNICODE(mapping);
4063 maplen = PyUnicode_GET_SIZE(mapping);
4064 while (s < e) {
4065 unsigned char ch = *s;
4066 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004068 if (ch < maplen)
4069 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004071 if (x == 0xfffe) {
4072 /* undefined mapping */
4073 outpos = p-PyUnicode_AS_UNICODE(v);
4074 startinpos = s-starts;
4075 endinpos = startinpos+1;
4076 if (unicode_decode_call_errorhandler(
4077 errors, &errorHandler,
4078 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004079 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004080 (PyObject **)&v, &outpos, &p)) {
4081 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004082 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004083 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004084 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004085 *p++ = x;
4086 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004088 }
4089 else {
4090 while (s < e) {
4091 unsigned char ch = *s;
4092 PyObject *w, *x;
4093
4094 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004095 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004096 if (w == NULL)
4097 goto onError;
4098 x = PyObject_GetItem(mapping, w);
4099 Py_DECREF(w);
4100 if (x == NULL) {
4101 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4102 /* No mapping found means: mapping is undefined. */
4103 PyErr_Clear();
4104 x = Py_None;
4105 Py_INCREF(x);
4106 } else
4107 goto onError;
4108 }
4109
4110 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004111 if (PyLong_Check(x)) {
4112 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004113 if (value < 0 || value > 65535) {
4114 PyErr_SetString(PyExc_TypeError,
4115 "character mapping must be in range(65536)");
4116 Py_DECREF(x);
4117 goto onError;
4118 }
4119 *p++ = (Py_UNICODE)value;
4120 }
4121 else if (x == Py_None) {
4122 /* undefined mapping */
4123 outpos = p-PyUnicode_AS_UNICODE(v);
4124 startinpos = s-starts;
4125 endinpos = startinpos+1;
4126 if (unicode_decode_call_errorhandler(
4127 errors, &errorHandler,
4128 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004129 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004130 (PyObject **)&v, &outpos, &p)) {
4131 Py_DECREF(x);
4132 goto onError;
4133 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004134 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004135 continue;
4136 }
4137 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004138 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004139
4140 if (targetsize == 1)
4141 /* 1-1 mapping */
4142 *p++ = *PyUnicode_AS_UNICODE(x);
4143
4144 else if (targetsize > 1) {
4145 /* 1-n mapping */
4146 if (targetsize > extrachars) {
4147 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004148 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4149 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004150 (targetsize << 2);
4151 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004152 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004153 if (_PyUnicode_Resize(&v,
4154 PyUnicode_GET_SIZE(v) + needed) < 0) {
4155 Py_DECREF(x);
4156 goto onError;
4157 }
4158 p = PyUnicode_AS_UNICODE(v) + oldpos;
4159 }
4160 Py_UNICODE_COPY(p,
4161 PyUnicode_AS_UNICODE(x),
4162 targetsize);
4163 p += targetsize;
4164 extrachars -= targetsize;
4165 }
4166 /* 1-0 mapping: skip the character */
4167 }
4168 else {
4169 /* wrong return value */
4170 PyErr_SetString(PyExc_TypeError,
4171 "character mapping must return integer, None or unicode");
4172 Py_DECREF(x);
4173 goto onError;
4174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004176 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 }
4179 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004180 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 Py_XDECREF(errorHandler);
4183 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004185
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 Py_XDECREF(errorHandler);
4188 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 Py_XDECREF(v);
4190 return NULL;
4191}
4192
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004193/* Charmap encoding: the lookup table */
4194
4195struct encoding_map{
4196 PyObject_HEAD
4197 unsigned char level1[32];
4198 int count2, count3;
4199 unsigned char level23[1];
4200};
4201
4202static PyObject*
4203encoding_map_size(PyObject *obj, PyObject* args)
4204{
4205 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004206 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004207 128*map->count3);
4208}
4209
4210static PyMethodDef encoding_map_methods[] = {
4211 {"size", encoding_map_size, METH_NOARGS,
4212 PyDoc_STR("Return the size (in bytes) of this object") },
4213 { 0 }
4214};
4215
4216static void
4217encoding_map_dealloc(PyObject* o)
4218{
4219 PyObject_FREE(o);
4220}
4221
4222static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004223 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004224 "EncodingMap", /*tp_name*/
4225 sizeof(struct encoding_map), /*tp_basicsize*/
4226 0, /*tp_itemsize*/
4227 /* methods */
4228 encoding_map_dealloc, /*tp_dealloc*/
4229 0, /*tp_print*/
4230 0, /*tp_getattr*/
4231 0, /*tp_setattr*/
4232 0, /*tp_compare*/
4233 0, /*tp_repr*/
4234 0, /*tp_as_number*/
4235 0, /*tp_as_sequence*/
4236 0, /*tp_as_mapping*/
4237 0, /*tp_hash*/
4238 0, /*tp_call*/
4239 0, /*tp_str*/
4240 0, /*tp_getattro*/
4241 0, /*tp_setattro*/
4242 0, /*tp_as_buffer*/
4243 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4244 0, /*tp_doc*/
4245 0, /*tp_traverse*/
4246 0, /*tp_clear*/
4247 0, /*tp_richcompare*/
4248 0, /*tp_weaklistoffset*/
4249 0, /*tp_iter*/
4250 0, /*tp_iternext*/
4251 encoding_map_methods, /*tp_methods*/
4252 0, /*tp_members*/
4253 0, /*tp_getset*/
4254 0, /*tp_base*/
4255 0, /*tp_dict*/
4256 0, /*tp_descr_get*/
4257 0, /*tp_descr_set*/
4258 0, /*tp_dictoffset*/
4259 0, /*tp_init*/
4260 0, /*tp_alloc*/
4261 0, /*tp_new*/
4262 0, /*tp_free*/
4263 0, /*tp_is_gc*/
4264};
4265
4266PyObject*
4267PyUnicode_BuildEncodingMap(PyObject* string)
4268{
4269 Py_UNICODE *decode;
4270 PyObject *result;
4271 struct encoding_map *mresult;
4272 int i;
4273 int need_dict = 0;
4274 unsigned char level1[32];
4275 unsigned char level2[512];
4276 unsigned char *mlevel1, *mlevel2, *mlevel3;
4277 int count2 = 0, count3 = 0;
4278
4279 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4280 PyErr_BadArgument();
4281 return NULL;
4282 }
4283 decode = PyUnicode_AS_UNICODE(string);
4284 memset(level1, 0xFF, sizeof level1);
4285 memset(level2, 0xFF, sizeof level2);
4286
4287 /* If there isn't a one-to-one mapping of NULL to \0,
4288 or if there are non-BMP characters, we need to use
4289 a mapping dictionary. */
4290 if (decode[0] != 0)
4291 need_dict = 1;
4292 for (i = 1; i < 256; i++) {
4293 int l1, l2;
4294 if (decode[i] == 0
4295 #ifdef Py_UNICODE_WIDE
4296 || decode[i] > 0xFFFF
4297 #endif
4298 ) {
4299 need_dict = 1;
4300 break;
4301 }
4302 if (decode[i] == 0xFFFE)
4303 /* unmapped character */
4304 continue;
4305 l1 = decode[i] >> 11;
4306 l2 = decode[i] >> 7;
4307 if (level1[l1] == 0xFF)
4308 level1[l1] = count2++;
4309 if (level2[l2] == 0xFF)
4310 level2[l2] = count3++;
4311 }
4312
4313 if (count2 >= 0xFF || count3 >= 0xFF)
4314 need_dict = 1;
4315
4316 if (need_dict) {
4317 PyObject *result = PyDict_New();
4318 PyObject *key, *value;
4319 if (!result)
4320 return NULL;
4321 for (i = 0; i < 256; i++) {
4322 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004323 key = PyLong_FromLong(decode[i]);
4324 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004325 if (!key || !value)
4326 goto failed1;
4327 if (PyDict_SetItem(result, key, value) == -1)
4328 goto failed1;
4329 Py_DECREF(key);
4330 Py_DECREF(value);
4331 }
4332 return result;
4333 failed1:
4334 Py_XDECREF(key);
4335 Py_XDECREF(value);
4336 Py_DECREF(result);
4337 return NULL;
4338 }
4339
4340 /* Create a three-level trie */
4341 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4342 16*count2 + 128*count3 - 1);
4343 if (!result)
4344 return PyErr_NoMemory();
4345 PyObject_Init(result, &EncodingMapType);
4346 mresult = (struct encoding_map*)result;
4347 mresult->count2 = count2;
4348 mresult->count3 = count3;
4349 mlevel1 = mresult->level1;
4350 mlevel2 = mresult->level23;
4351 mlevel3 = mresult->level23 + 16*count2;
4352 memcpy(mlevel1, level1, 32);
4353 memset(mlevel2, 0xFF, 16*count2);
4354 memset(mlevel3, 0, 128*count3);
4355 count3 = 0;
4356 for (i = 1; i < 256; i++) {
4357 int o1, o2, o3, i2, i3;
4358 if (decode[i] == 0xFFFE)
4359 /* unmapped character */
4360 continue;
4361 o1 = decode[i]>>11;
4362 o2 = (decode[i]>>7) & 0xF;
4363 i2 = 16*mlevel1[o1] + o2;
4364 if (mlevel2[i2] == 0xFF)
4365 mlevel2[i2] = count3++;
4366 o3 = decode[i] & 0x7F;
4367 i3 = 128*mlevel2[i2] + o3;
4368 mlevel3[i3] = i;
4369 }
4370 return result;
4371}
4372
4373static int
4374encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4375{
4376 struct encoding_map *map = (struct encoding_map*)mapping;
4377 int l1 = c>>11;
4378 int l2 = (c>>7) & 0xF;
4379 int l3 = c & 0x7F;
4380 int i;
4381
4382#ifdef Py_UNICODE_WIDE
4383 if (c > 0xFFFF) {
4384 return -1;
4385 }
4386#endif
4387 if (c == 0)
4388 return 0;
4389 /* level 1*/
4390 i = map->level1[l1];
4391 if (i == 0xFF) {
4392 return -1;
4393 }
4394 /* level 2*/
4395 i = map->level23[16*i+l2];
4396 if (i == 0xFF) {
4397 return -1;
4398 }
4399 /* level 3 */
4400 i = map->level23[16*map->count2 + 128*i + l3];
4401 if (i == 0) {
4402 return -1;
4403 }
4404 return i;
4405}
4406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407/* Lookup the character ch in the mapping. If the character
4408 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004409 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411{
Christian Heimes217cfd12007-12-02 14:31:20 +00004412 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 PyObject *x;
4414
4415 if (w == NULL)
4416 return NULL;
4417 x = PyObject_GetItem(mapping, w);
4418 Py_DECREF(w);
4419 if (x == NULL) {
4420 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4421 /* No mapping found means: mapping is undefined. */
4422 PyErr_Clear();
4423 x = Py_None;
4424 Py_INCREF(x);
4425 return x;
4426 } else
4427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004429 else if (x == Py_None)
4430 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004431 else if (PyLong_Check(x)) {
4432 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 if (value < 0 || value > 255) {
4434 PyErr_SetString(PyExc_TypeError,
4435 "character mapping must be in range(256)");
4436 Py_DECREF(x);
4437 return NULL;
4438 }
4439 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004441 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004445 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004446 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004447 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 Py_DECREF(x);
4449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 }
4451}
4452
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004453static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004454charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004455{
Christian Heimes72b710a2008-05-26 13:28:38 +00004456 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 /* exponentially overallocate to minimize reallocations */
4458 if (requiredsize < 2*outsize)
4459 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004460 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004461 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004462 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004463}
4464
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004469 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 space is available. Return a new reference to the object that
4471 was put in the output buffer, or Py_None, if the mapping was undefined
4472 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004473 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004475charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004476 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004478 PyObject *rep;
4479 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004480 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481
Christian Heimes90aa7642007-12-19 02:45:37 +00004482 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004483 int res = encoding_map_lookup(c, mapping);
4484 Py_ssize_t requiredsize = *outpos+1;
4485 if (res == -1)
4486 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004487 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004488 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004489 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004490 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004491 outstart[(*outpos)++] = (char)res;
4492 return enc_SUCCESS;
4493 }
4494
4495 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004497 return enc_EXCEPTION;
4498 else if (rep==Py_None) {
4499 Py_DECREF(rep);
4500 return enc_FAILED;
4501 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004502 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004504 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004505 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004507 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004509 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004510 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 }
4512 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004513 const char *repchars = PyBytes_AS_STRING(rep);
4514 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004515 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004516 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004517 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004519 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004521 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 memcpy(outstart + *outpos, repchars, repsize);
4523 *outpos += repsize;
4524 }
4525 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004526 Py_DECREF(rep);
4527 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528}
4529
4530/* handle an error in PyUnicode_EncodeCharmap
4531 Return 0 on success, -1 on error */
4532static
4533int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004534 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004536 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004537 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538{
4539 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t repsize;
4541 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 Py_UNICODE *uni2;
4543 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t collstartpos = *inpos;
4545 Py_ssize_t collendpos = *inpos+1;
4546 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 char *encoding = "charmap";
4548 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004549 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 /* find all unencodable characters */
4552 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004553 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004554 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004555 int res = encoding_map_lookup(p[collendpos], mapping);
4556 if (res != -1)
4557 break;
4558 ++collendpos;
4559 continue;
4560 }
4561
4562 rep = charmapencode_lookup(p[collendpos], mapping);
4563 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004565 else if (rep!=Py_None) {
4566 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 break;
4568 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004569 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 ++collendpos;
4571 }
4572 /* cache callback name lookup
4573 * (if not done yet, i.e. it's the first error) */
4574 if (*known_errorHandler==-1) {
4575 if ((errors==NULL) || (!strcmp(errors, "strict")))
4576 *known_errorHandler = 1;
4577 else if (!strcmp(errors, "replace"))
4578 *known_errorHandler = 2;
4579 else if (!strcmp(errors, "ignore"))
4580 *known_errorHandler = 3;
4581 else if (!strcmp(errors, "xmlcharrefreplace"))
4582 *known_errorHandler = 4;
4583 else
4584 *known_errorHandler = 0;
4585 }
4586 switch (*known_errorHandler) {
4587 case 1: /* strict */
4588 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4589 return -1;
4590 case 2: /* replace */
4591 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4592 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004593 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 return -1;
4595 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004596 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4598 return -1;
4599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 }
4601 /* fall through */
4602 case 3: /* ignore */
4603 *inpos = collendpos;
4604 break;
4605 case 4: /* xmlcharrefreplace */
4606 /* generate replacement (temporarily (mis)uses p) */
4607 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4608 char buffer[2+29+1+1];
4609 char *cp;
4610 sprintf(buffer, "&#%d;", (int)p[collpos]);
4611 for (cp = buffer; *cp; ++cp) {
4612 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004613 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617 return -1;
4618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 }
4620 }
4621 *inpos = collendpos;
4622 break;
4623 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004624 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 encoding, reason, p, size, exceptionObject,
4626 collstartpos, collendpos, &newpos);
4627 if (repunicode == NULL)
4628 return -1;
4629 /* generate replacement */
4630 repsize = PyUnicode_GET_SIZE(repunicode);
4631 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4632 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004633 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 return -1;
4635 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004636 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4639 return -1;
4640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 }
4642 *inpos = newpos;
4643 Py_DECREF(repunicode);
4644 }
4645 return 0;
4646}
4647
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 PyObject *mapping,
4651 const char *errors)
4652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* output object */
4654 PyObject *res = NULL;
4655 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004656 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 PyObject *errorHandler = NULL;
4660 PyObject *exc = NULL;
4661 /* the following variable is used for caching string comparisons
4662 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4663 * 3=ignore, 4=xmlcharrefreplace */
4664 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665
4666 /* Default to Latin-1 */
4667 if (mapping == NULL)
4668 return PyUnicode_EncodeLatin1(p, size, errors);
4669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 /* allocate enough for a simple encoding without
4671 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004672 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 if (res == NULL)
4674 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004675 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 while (inpos<size) {
4679 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004680 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004681 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004683 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 if (charmap_encoding_error(p, size, &inpos, mapping,
4685 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004686 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004687 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004688 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 else
4692 /* done with this character => adjust input position */
4693 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004697 if (respos<PyBytes_GET_SIZE(res))
4698 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 Py_XDECREF(exc);
4701 Py_XDECREF(errorHandler);
4702 return res;
4703
4704 onError:
4705 Py_XDECREF(res);
4706 Py_XDECREF(exc);
4707 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 return NULL;
4709}
4710
4711PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4712 PyObject *mapping)
4713{
4714 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4715 PyErr_BadArgument();
4716 return NULL;
4717 }
4718 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4719 PyUnicode_GET_SIZE(unicode),
4720 mapping,
4721 NULL);
4722}
4723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724/* create or adjust a UnicodeTranslateError */
4725static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004726 const Py_UNICODE *unicode, Py_ssize_t size,
4727 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 if (*exceptionObject == NULL) {
4731 *exceptionObject = PyUnicodeTranslateError_Create(
4732 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 }
4734 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4736 goto onError;
4737 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4738 goto onError;
4739 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4740 goto onError;
4741 return;
4742 onError:
4743 Py_DECREF(*exceptionObject);
4744 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 }
4746}
4747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748/* raises a UnicodeTranslateError */
4749static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 const Py_UNICODE *unicode, Py_ssize_t size,
4751 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 const char *reason)
4753{
4754 make_translate_exception(exceptionObject,
4755 unicode, size, startpos, endpos, reason);
4756 if (*exceptionObject != NULL)
4757 PyCodec_StrictErrors(*exceptionObject);
4758}
4759
4760/* error handling callback helper:
4761 build arguments, call the callback and check the arguments,
4762 put the result into newpos and return the replacement string, which
4763 has to be freed by the caller */
4764static PyObject *unicode_translate_call_errorhandler(const char *errors,
4765 PyObject **errorHandler,
4766 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4768 Py_ssize_t startpos, Py_ssize_t endpos,
4769 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004771 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004773 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 PyObject *restuple;
4775 PyObject *resunicode;
4776
4777 if (*errorHandler == NULL) {
4778 *errorHandler = PyCodec_LookupError(errors);
4779 if (*errorHandler == NULL)
4780 return NULL;
4781 }
4782
4783 make_translate_exception(exceptionObject,
4784 unicode, size, startpos, endpos, reason);
4785 if (*exceptionObject == NULL)
4786 return NULL;
4787
4788 restuple = PyObject_CallFunctionObjArgs(
4789 *errorHandler, *exceptionObject, NULL);
4790 if (restuple == NULL)
4791 return NULL;
4792 if (!PyTuple_Check(restuple)) {
4793 PyErr_Format(PyExc_TypeError, &argparse[4]);
4794 Py_DECREF(restuple);
4795 return NULL;
4796 }
4797 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004798 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 Py_DECREF(restuple);
4800 return NULL;
4801 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 if (i_newpos<0)
4803 *newpos = size+i_newpos;
4804 else
4805 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004806 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004807 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004808 Py_DECREF(restuple);
4809 return NULL;
4810 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 Py_INCREF(resunicode);
4812 Py_DECREF(restuple);
4813 return resunicode;
4814}
4815
4816/* Lookup the character ch in the mapping and put the result in result,
4817 which must be decrefed by the caller.
4818 Return 0 on success, -1 on error */
4819static
4820int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4821{
Christian Heimes217cfd12007-12-02 14:31:20 +00004822 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 PyObject *x;
4824
4825 if (w == NULL)
4826 return -1;
4827 x = PyObject_GetItem(mapping, w);
4828 Py_DECREF(w);
4829 if (x == NULL) {
4830 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4831 /* No mapping found means: use 1:1 mapping. */
4832 PyErr_Clear();
4833 *result = NULL;
4834 return 0;
4835 } else
4836 return -1;
4837 }
4838 else if (x == Py_None) {
4839 *result = x;
4840 return 0;
4841 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004842 else if (PyLong_Check(x)) {
4843 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 long max = PyUnicode_GetMax();
4845 if (value < 0 || value > max) {
4846 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004847 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 Py_DECREF(x);
4849 return -1;
4850 }
4851 *result = x;
4852 return 0;
4853 }
4854 else if (PyUnicode_Check(x)) {
4855 *result = x;
4856 return 0;
4857 }
4858 else {
4859 /* wrong return value */
4860 PyErr_SetString(PyExc_TypeError,
4861 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004862 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 return -1;
4864 }
4865}
4866/* ensure that *outobj is at least requiredsize characters long,
4867if not reallocate and adjust various state variables.
4868Return 0 on success, -1 on error */
4869static
Walter Dörwald4894c302003-10-24 14:25:28 +00004870int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004874 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004876 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004878 if (requiredsize < 2 * oldsize)
4879 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004880 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 return -1;
4882 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 }
4884 return 0;
4885}
4886/* lookup the character, put the result in the output string and adjust
4887 various state variables. Return a new reference to the object that
4888 was put in the output buffer in *result, or Py_None, if the mapping was
4889 undefined (in which case no character was written).
4890 The called must decref result.
4891 Return 0 on success, -1 on error. */
4892static
Walter Dörwald4894c302003-10-24 14:25:28 +00004893int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004895 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896{
Walter Dörwald4894c302003-10-24 14:25:28 +00004897 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 return -1;
4899 if (*res==NULL) {
4900 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004901 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 }
4903 else if (*res==Py_None)
4904 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004905 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004907 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 }
4909 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004910 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 if (repsize==1) {
4912 /* no overflow check, because we know that the space is enough */
4913 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4914 }
4915 else if (repsize!=0) {
4916 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004917 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004918 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004919 repsize - 1;
4920 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 return -1;
4922 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4923 *outp += repsize;
4924 }
4925 }
4926 else
4927 return -1;
4928 return 0;
4929}
4930
4931PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004932 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 PyObject *mapping,
4934 const char *errors)
4935{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 /* output object */
4937 PyObject *res = NULL;
4938 /* pointers to the beginning and end+1 of input */
4939 const Py_UNICODE *startp = p;
4940 const Py_UNICODE *endp = p + size;
4941 /* pointer into the output */
4942 Py_UNICODE *str;
4943 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004944 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 char *reason = "character maps to <undefined>";
4946 PyObject *errorHandler = NULL;
4947 PyObject *exc = NULL;
4948 /* the following variable is used for caching string comparisons
4949 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4950 * 3=ignore, 4=xmlcharrefreplace */
4951 int known_errorHandler = -1;
4952
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 if (mapping == NULL) {
4954 PyErr_BadArgument();
4955 return NULL;
4956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957
4958 /* allocate enough for a simple 1:1 translation without
4959 replacements, if we need more, we'll resize */
4960 res = PyUnicode_FromUnicode(NULL, size);
4961 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004962 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 return res;
4965 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 while (p<endp) {
4968 /* try to encode it */
4969 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004970 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 goto onError;
4973 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004974 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 if (x!=Py_None) /* it worked => adjust input pointer */
4976 ++p;
4977 else { /* untranslatable character */
4978 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004979 Py_ssize_t repsize;
4980 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 Py_UNICODE *uni2;
4982 /* startpos for collecting untranslatable chars */
4983 const Py_UNICODE *collstart = p;
4984 const Py_UNICODE *collend = p+1;
4985 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 /* find all untranslatable characters */
4988 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004989 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990 goto onError;
4991 Py_XDECREF(x);
4992 if (x!=Py_None)
4993 break;
4994 ++collend;
4995 }
4996 /* cache callback name lookup
4997 * (if not done yet, i.e. it's the first error) */
4998 if (known_errorHandler==-1) {
4999 if ((errors==NULL) || (!strcmp(errors, "strict")))
5000 known_errorHandler = 1;
5001 else if (!strcmp(errors, "replace"))
5002 known_errorHandler = 2;
5003 else if (!strcmp(errors, "ignore"))
5004 known_errorHandler = 3;
5005 else if (!strcmp(errors, "xmlcharrefreplace"))
5006 known_errorHandler = 4;
5007 else
5008 known_errorHandler = 0;
5009 }
5010 switch (known_errorHandler) {
5011 case 1: /* strict */
5012 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5013 goto onError;
5014 case 2: /* replace */
5015 /* No need to check for space, this is a 1:1 replacement */
5016 for (coll = collstart; coll<collend; ++coll)
5017 *str++ = '?';
5018 /* fall through */
5019 case 3: /* ignore */
5020 p = collend;
5021 break;
5022 case 4: /* xmlcharrefreplace */
5023 /* generate replacement (temporarily (mis)uses p) */
5024 for (p = collstart; p < collend; ++p) {
5025 char buffer[2+29+1+1];
5026 char *cp;
5027 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005028 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5030 goto onError;
5031 for (cp = buffer; *cp; ++cp)
5032 *str++ = *cp;
5033 }
5034 p = collend;
5035 break;
5036 default:
5037 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5038 reason, startp, size, &exc,
5039 collstart-startp, collend-startp, &newpos);
5040 if (repunicode == NULL)
5041 goto onError;
5042 /* generate replacement */
5043 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005044 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5046 Py_DECREF(repunicode);
5047 goto onError;
5048 }
5049 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5050 *str++ = *uni2;
5051 p = startp + newpos;
5052 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 }
5054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 /* Resize if we allocated to much */
5057 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005058 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005059 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 }
5062 Py_XDECREF(exc);
5063 Py_XDECREF(errorHandler);
5064 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005066 onError:
5067 Py_XDECREF(res);
5068 Py_XDECREF(exc);
5069 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 return NULL;
5071}
5072
5073PyObject *PyUnicode_Translate(PyObject *str,
5074 PyObject *mapping,
5075 const char *errors)
5076{
5077 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005078
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 str = PyUnicode_FromObject(str);
5080 if (str == NULL)
5081 goto onError;
5082 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5083 PyUnicode_GET_SIZE(str),
5084 mapping,
5085 errors);
5086 Py_DECREF(str);
5087 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 onError:
5090 Py_XDECREF(str);
5091 return NULL;
5092}
Tim Petersced69f82003-09-16 20:30:58 +00005093
Guido van Rossum9e896b32000-04-05 20:11:21 +00005094/* --- Decimal Encoder ---------------------------------------------------- */
5095
5096int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005097 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005098 char *output,
5099 const char *errors)
5100{
5101 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 PyObject *errorHandler = NULL;
5103 PyObject *exc = NULL;
5104 const char *encoding = "decimal";
5105 const char *reason = "invalid decimal Unicode string";
5106 /* the following variable is used for caching string comparisons
5107 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5108 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005109
5110 if (output == NULL) {
5111 PyErr_BadArgument();
5112 return -1;
5113 }
5114
5115 p = s;
5116 end = s + length;
5117 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005119 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005120 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121 Py_ssize_t repsize;
5122 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 Py_UNICODE *uni2;
5124 Py_UNICODE *collstart;
5125 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005126
Guido van Rossum9e896b32000-04-05 20:11:21 +00005127 if (Py_UNICODE_ISSPACE(ch)) {
5128 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130 continue;
5131 }
5132 decimal = Py_UNICODE_TODECIMAL(ch);
5133 if (decimal >= 0) {
5134 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005136 continue;
5137 }
Guido van Rossumba477042000-04-06 18:18:10 +00005138 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005139 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005141 continue;
5142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 /* All other characters are considered unencodable */
5144 collstart = p;
5145 collend = p+1;
5146 while (collend < end) {
5147 if ((0 < *collend && *collend < 256) ||
5148 !Py_UNICODE_ISSPACE(*collend) ||
5149 Py_UNICODE_TODECIMAL(*collend))
5150 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 /* cache callback name lookup
5153 * (if not done yet, i.e. it's the first error) */
5154 if (known_errorHandler==-1) {
5155 if ((errors==NULL) || (!strcmp(errors, "strict")))
5156 known_errorHandler = 1;
5157 else if (!strcmp(errors, "replace"))
5158 known_errorHandler = 2;
5159 else if (!strcmp(errors, "ignore"))
5160 known_errorHandler = 3;
5161 else if (!strcmp(errors, "xmlcharrefreplace"))
5162 known_errorHandler = 4;
5163 else
5164 known_errorHandler = 0;
5165 }
5166 switch (known_errorHandler) {
5167 case 1: /* strict */
5168 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5169 goto onError;
5170 case 2: /* replace */
5171 for (p = collstart; p < collend; ++p)
5172 *output++ = '?';
5173 /* fall through */
5174 case 3: /* ignore */
5175 p = collend;
5176 break;
5177 case 4: /* xmlcharrefreplace */
5178 /* generate replacement (temporarily (mis)uses p) */
5179 for (p = collstart; p < collend; ++p)
5180 output += sprintf(output, "&#%d;", (int)*p);
5181 p = collend;
5182 break;
5183 default:
5184 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5185 encoding, reason, s, length, &exc,
5186 collstart-s, collend-s, &newpos);
5187 if (repunicode == NULL)
5188 goto onError;
5189 /* generate replacement */
5190 repsize = PyUnicode_GET_SIZE(repunicode);
5191 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5192 Py_UNICODE ch = *uni2;
5193 if (Py_UNICODE_ISSPACE(ch))
5194 *output++ = ' ';
5195 else {
5196 decimal = Py_UNICODE_TODECIMAL(ch);
5197 if (decimal >= 0)
5198 *output++ = '0' + decimal;
5199 else if (0 < ch && ch < 256)
5200 *output++ = (char)ch;
5201 else {
5202 Py_DECREF(repunicode);
5203 raise_encode_exception(&exc, encoding,
5204 s, length, collstart-s, collend-s, reason);
5205 goto onError;
5206 }
5207 }
5208 }
5209 p = s + newpos;
5210 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005211 }
5212 }
5213 /* 0-terminate the output string */
5214 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 Py_XDECREF(exc);
5216 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005217 return 0;
5218
5219 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 Py_XDECREF(exc);
5221 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005222 return -1;
5223}
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225/* --- Helpers ------------------------------------------------------------ */
5226
Eric Smith8c663262007-08-25 02:26:07 +00005227#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005228#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005229#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005230/* Include _ParseTupleFinds from find.h */
5231#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005232#include "stringlib/find.h"
5233#include "stringlib/partition.h"
5234
Eric Smith5807c412008-05-11 21:00:57 +00005235#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5236#include "stringlib/localeutil.h"
5237
/* helper macro to fixup start/end slice values.

   Operates on variables named start and end in the enclosing scope and
   on (obj)->length: a negative value is offset by the length and then
   clamped to 0, and end is capped at the length.

   Wrapped in do { } while (0) so that it expands to a single statement
   and stays safe inside unbraced if/else; invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5250
Martin v. Löwis18e16552006-02-15 17:27:45 +00005251Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005252 PyObject *substr,
5253 Py_ssize_t start,
5254 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005256 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005257 PyUnicodeObject* str_obj;
5258 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005259
Thomas Wouters477c8d52006-05-27 19:21:47 +00005260 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5261 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005263 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5264 if (!sub_obj) {
5265 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 return -1;
5267 }
Tim Petersced69f82003-09-16 20:30:58 +00005268
Thomas Wouters477c8d52006-05-27 19:21:47 +00005269 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005270
Thomas Wouters477c8d52006-05-27 19:21:47 +00005271 result = stringlib_count(
5272 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5273 );
5274
5275 Py_DECREF(sub_obj);
5276 Py_DECREF(str_obj);
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 return result;
5279}
5280
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005282 PyObject *sub,
5283 Py_ssize_t start,
5284 Py_ssize_t end,
5285 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005287 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005290 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005291 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005292 sub = PyUnicode_FromObject(sub);
5293 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005294 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005295 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 }
Tim Petersced69f82003-09-16 20:30:58 +00005297
Thomas Wouters477c8d52006-05-27 19:21:47 +00005298 if (direction > 0)
5299 result = stringlib_find_slice(
5300 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5301 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5302 start, end
5303 );
5304 else
5305 result = stringlib_rfind_slice(
5306 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5307 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5308 start, end
5309 );
5310
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005312 Py_DECREF(sub);
5313
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 return result;
5315}
5316
Tim Petersced69f82003-09-16 20:30:58 +00005317static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318int tailmatch(PyUnicodeObject *self,
5319 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005320 Py_ssize_t start,
5321 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 int direction)
5323{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 if (substring->length == 0)
5325 return 1;
5326
Thomas Wouters477c8d52006-05-27 19:21:47 +00005327 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328
5329 end -= substring->length;
5330 if (end < start)
5331 return 0;
5332
5333 if (direction > 0) {
5334 if (Py_UNICODE_MATCH(self, end, substring))
5335 return 1;
5336 } else {
5337 if (Py_UNICODE_MATCH(self, start, substring))
5338 return 1;
5339 }
5340
5341 return 0;
5342}
5343
Martin v. Löwis18e16552006-02-15 17:27:45 +00005344Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t start,
5347 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 int direction)
5349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 str = PyUnicode_FromObject(str);
5353 if (str == NULL)
5354 return -1;
5355 substr = PyUnicode_FromObject(substr);
5356 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005357 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 return -1;
5359 }
Tim Petersced69f82003-09-16 20:30:58 +00005360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 result = tailmatch((PyUnicodeObject *)str,
5362 (PyUnicodeObject *)substr,
5363 start, end, direction);
5364 Py_DECREF(str);
5365 Py_DECREF(substr);
5366 return result;
5367}
5368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369/* Apply fixfct filter to the Unicode object self and return a
5370 reference to the modified object */
5371
Tim Petersced69f82003-09-16 20:30:58 +00005372static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373PyObject *fixup(PyUnicodeObject *self,
5374 int (*fixfct)(PyUnicodeObject *s))
5375{
5376
5377 PyUnicodeObject *u;
5378
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005379 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 if (u == NULL)
5381 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005382
5383 Py_UNICODE_COPY(u->str, self->str, self->length);
5384
Tim Peters7a29bd52001-09-12 03:03:31 +00005385 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 /* fixfct should return TRUE if it modified the buffer. If
5387 FALSE, return a reference to the original buffer instead
5388 (to save space, not time) */
5389 Py_INCREF(self);
5390 Py_DECREF(u);
5391 return (PyObject*) self;
5392 }
5393 return (PyObject*) u;
5394}
5395
Tim Petersced69f82003-09-16 20:30:58 +00005396static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397int fixupper(PyUnicodeObject *self)
5398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 Py_UNICODE *s = self->str;
5401 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005402
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 while (len-- > 0) {
5404 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 ch = Py_UNICODE_TOUPPER(*s);
5407 if (ch != *s) {
5408 status = 1;
5409 *s = ch;
5410 }
5411 s++;
5412 }
5413
5414 return status;
5415}
5416
Tim Petersced69f82003-09-16 20:30:58 +00005417static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418int fixlower(PyUnicodeObject *self)
5419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 Py_UNICODE *s = self->str;
5422 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 while (len-- > 0) {
5425 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 ch = Py_UNICODE_TOLOWER(*s);
5428 if (ch != *s) {
5429 status = 1;
5430 *s = ch;
5431 }
5432 s++;
5433 }
5434
5435 return status;
5436}
5437
Tim Petersced69f82003-09-16 20:30:58 +00005438static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439int fixswapcase(PyUnicodeObject *self)
5440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005441 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 Py_UNICODE *s = self->str;
5443 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 while (len-- > 0) {
5446 if (Py_UNICODE_ISUPPER(*s)) {
5447 *s = Py_UNICODE_TOLOWER(*s);
5448 status = 1;
5449 } else if (Py_UNICODE_ISLOWER(*s)) {
5450 *s = Py_UNICODE_TOUPPER(*s);
5451 status = 1;
5452 }
5453 s++;
5454 }
5455
5456 return status;
5457}
5458
Tim Petersced69f82003-09-16 20:30:58 +00005459static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460int fixcapitalize(PyUnicodeObject *self)
5461{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005463 Py_UNICODE *s = self->str;
5464 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005465
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005466 if (len == 0)
5467 return 0;
5468 if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005472 s++;
5473 while (--len > 0) {
5474 if (Py_UNICODE_ISUPPER(*s)) {
5475 *s = Py_UNICODE_TOLOWER(*s);
5476 status = 1;
5477 }
5478 s++;
5479 }
5480 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481}
5482
5483static
5484int fixtitle(PyUnicodeObject *self)
5485{
5486 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5487 register Py_UNICODE *e;
5488 int previous_is_cased;
5489
5490 /* Shortcut for single character strings */
5491 if (PyUnicode_GET_SIZE(self) == 1) {
5492 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5493 if (*p != ch) {
5494 *p = ch;
5495 return 1;
5496 }
5497 else
5498 return 0;
5499 }
Tim Petersced69f82003-09-16 20:30:58 +00005500
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 e = p + PyUnicode_GET_SIZE(self);
5502 previous_is_cased = 0;
5503 for (; p < e; p++) {
5504 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005505
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 if (previous_is_cased)
5507 *p = Py_UNICODE_TOLOWER(ch);
5508 else
5509 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005510
5511 if (Py_UNICODE_ISLOWER(ch) ||
5512 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 Py_UNICODE_ISTITLE(ch))
5514 previous_is_cased = 1;
5515 else
5516 previous_is_cased = 0;
5517 }
5518 return 1;
5519}
5520
Tim Peters8ce9f162004-08-27 01:49:32 +00005521PyObject *
5522PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523{
Tim Peters8ce9f162004-08-27 01:49:32 +00005524 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005525 const Py_UNICODE blank = ' ';
5526 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005527 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005529 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5530 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5532 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005533 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005535 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536
Tim Peters05eba1f2004-08-27 21:32:02 +00005537 fseq = PySequence_Fast(seq, "");
5538 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005539 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005540 }
5541
Tim Peters91879ab2004-08-27 22:35:44 +00005542 /* Grrrr. A codec may be invoked to convert str objects to
5543 * Unicode, and so it's possible to call back into Python code
5544 * during PyUnicode_FromObject(), and so it's possible for a sick
5545 * codec to change the size of fseq (if seq is a list). Therefore
5546 * we have to keep refetching the size -- can't assume seqlen
5547 * is invariant.
5548 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005549 seqlen = PySequence_Fast_GET_SIZE(fseq);
5550 /* If empty sequence, return u"". */
5551 if (seqlen == 0) {
5552 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5553 goto Done;
5554 }
5555 /* If singleton sequence with an exact Unicode, return that. */
5556 if (seqlen == 1) {
5557 item = PySequence_Fast_GET_ITEM(fseq, 0);
5558 if (PyUnicode_CheckExact(item)) {
5559 Py_INCREF(item);
5560 res = (PyUnicodeObject *)item;
5561 goto Done;
5562 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005563 }
5564
Tim Peters05eba1f2004-08-27 21:32:02 +00005565 /* At least two items to join, or one that isn't exact Unicode. */
5566 if (seqlen > 1) {
5567 /* Set up sep and seplen -- they're needed. */
5568 if (separator == NULL) {
5569 sep = &blank;
5570 seplen = 1;
5571 }
5572 else {
5573 internal_separator = PyUnicode_FromObject(separator);
5574 if (internal_separator == NULL)
5575 goto onError;
5576 sep = PyUnicode_AS_UNICODE(internal_separator);
5577 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005578 /* In case PyUnicode_FromObject() mutated seq. */
5579 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005580 }
5581 }
5582
5583 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005584 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005585 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005586 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 res_p = PyUnicode_AS_UNICODE(res);
5588 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005589
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005591 Py_ssize_t itemlen;
5592 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005593
5594 item = PySequence_Fast_GET_ITEM(fseq, i);
5595 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005596 if (!PyUnicode_Check(item)) {
5597 PyErr_Format(PyExc_TypeError,
5598 "sequence item %zd: expected str instance,"
5599 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005600 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005601 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005602 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005603 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005604 if (item == NULL)
5605 goto onError;
5606 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005607
Tim Peters91879ab2004-08-27 22:35:44 +00005608 /* In case PyUnicode_FromObject() mutated seq. */
5609 seqlen = PySequence_Fast_GET_SIZE(fseq);
5610
Tim Peters8ce9f162004-08-27 01:49:32 +00005611 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005614 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005615 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005616 if (i < seqlen - 1) {
5617 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005618 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005619 goto Overflow;
5620 }
5621 if (new_res_used > res_alloc) {
5622 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005623 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005626 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005627 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005628 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005629 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005631 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005632 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005634
5635 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005636 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005637 res_p += itemlen;
5638 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005639 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005640 res_p += seplen;
5641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005643 res_used = new_res_used;
5644 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005645
Tim Peters05eba1f2004-08-27 21:32:02 +00005646 /* Shrink res to match the used area; this probably can't fail,
5647 * but it's cheap to check.
5648 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005649 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005650 goto onError;
5651
5652 Done:
5653 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005654 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 return (PyObject *)res;
5656
Tim Peters8ce9f162004-08-27 01:49:32 +00005657 Overflow:
5658 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005659 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005660 Py_DECREF(item);
5661 /* fall through */
5662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005664 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005666 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 return NULL;
5668}
5669
Tim Petersced69f82003-09-16 20:30:58 +00005670static
5671PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005672 Py_ssize_t left,
5673 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 Py_UNICODE fill)
5675{
5676 PyUnicodeObject *u;
5677
5678 if (left < 0)
5679 left = 0;
5680 if (right < 0)
5681 right = 0;
5682
Tim Peters7a29bd52001-09-12 03:03:31 +00005683 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 Py_INCREF(self);
5685 return self;
5686 }
5687
5688 u = _PyUnicode_New(left + self->length + right);
5689 if (u) {
5690 if (left)
5691 Py_UNICODE_FILL(u->str, fill, left);
5692 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5693 if (right)
5694 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5695 }
5696
5697 return u;
5698}
5699
/* Append a new unicode object holding data[left:right] to `list`.
 *
 * Relies on three names in the expanding scope: a `PyObject *str`
 * scratch variable, the target `list`, and an `onError` label that is
 * jumped to if either the object creation or the append fails.
 *
 * Wrapped in do { } while (0) so that `SPLIT_APPEND(...);` behaves as
 * one statement, also inside an unbraced if/else.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5710
5711static
5712PyObject *split_whitespace(PyUnicodeObject *self,
5713 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005714 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005716 register Py_ssize_t i;
5717 register Py_ssize_t j;
5718 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005720 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
5722 for (i = j = 0; i < len; ) {
5723 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005724 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 i++;
5726 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005727 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 i++;
5729 if (j < i) {
5730 if (maxcount-- <= 0)
5731 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005732 SPLIT_APPEND(buf, j, i);
5733 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 i++;
5735 j = i;
5736 }
5737 }
5738 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005739 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 }
5741 return list;
5742
5743 onError:
5744 Py_DECREF(list);
5745 return NULL;
5746}
5747
5748PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005749 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 register Py_ssize_t i;
5752 register Py_ssize_t j;
5753 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 PyObject *list;
5755 PyObject *str;
5756 Py_UNICODE *data;
5757
5758 string = PyUnicode_FromObject(string);
5759 if (string == NULL)
5760 return NULL;
5761 data = PyUnicode_AS_UNICODE(string);
5762 len = PyUnicode_GET_SIZE(string);
5763
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 list = PyList_New(0);
5765 if (!list)
5766 goto onError;
5767
5768 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005769 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005770
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005772 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
5775 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005776 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 if (i < len) {
5778 if (data[i] == '\r' && i + 1 < len &&
5779 data[i+1] == '\n')
5780 i += 2;
5781 else
5782 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005783 if (keepends)
5784 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 }
Guido van Rossum86662912000-04-11 15:38:46 +00005786 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 j = i;
5788 }
5789 if (j < len) {
5790 SPLIT_APPEND(data, j, len);
5791 }
5792
5793 Py_DECREF(string);
5794 return list;
5795
5796 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005797 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 Py_DECREF(string);
5799 return NULL;
5800}
5801
Tim Petersced69f82003-09-16 20:30:58 +00005802static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803PyObject *split_char(PyUnicodeObject *self,
5804 PyObject *list,
5805 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005806 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005808 register Py_ssize_t i;
5809 register Py_ssize_t j;
5810 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005812 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813
5814 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005815 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 if (maxcount-- <= 0)
5817 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005818 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 i = j = i + 1;
5820 } else
5821 i++;
5822 }
5823 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005824 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 }
5826 return list;
5827
5828 onError:
5829 Py_DECREF(list);
5830 return NULL;
5831}
5832
Tim Petersced69f82003-09-16 20:30:58 +00005833static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834PyObject *split_substring(PyUnicodeObject *self,
5835 PyObject *list,
5836 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 register Py_ssize_t i;
5840 register Py_ssize_t j;
5841 Py_ssize_t len = self->length;
5842 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 PyObject *str;
5844
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005845 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 if (Py_UNICODE_MATCH(self, i, substring)) {
5847 if (maxcount-- <= 0)
5848 break;
5849 SPLIT_APPEND(self->str, j, i);
5850 i = j = i + sublen;
5851 } else
5852 i++;
5853 }
5854 if (j <= len) {
5855 SPLIT_APPEND(self->str, j, len);
5856 }
5857 return list;
5858
5859 onError:
5860 Py_DECREF(list);
5861 return NULL;
5862}
5863
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864static
5865PyObject *rsplit_whitespace(PyUnicodeObject *self,
5866 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 register Py_ssize_t i;
5870 register Py_ssize_t j;
5871 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005873 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874
5875 for (i = j = len - 1; i >= 0; ) {
5876 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005877 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005878 i--;
5879 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005880 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005881 i--;
5882 if (j > i) {
5883 if (maxcount-- <= 0)
5884 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005885 SPLIT_APPEND(buf, i + 1, j + 1);
5886 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005887 i--;
5888 j = i;
5889 }
5890 }
5891 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005892 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005893 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005894 if (PyList_Reverse(list) < 0)
5895 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005896 return list;
5897
5898 onError:
5899 Py_DECREF(list);
5900 return NULL;
5901}
5902
5903static
5904PyObject *rsplit_char(PyUnicodeObject *self,
5905 PyObject *list,
5906 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005907 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005908{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005909 register Py_ssize_t i;
5910 register Py_ssize_t j;
5911 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005912 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005913 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914
5915 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005916 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005917 if (maxcount-- <= 0)
5918 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005919 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005920 j = i = i - 1;
5921 } else
5922 i--;
5923 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005924 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005925 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005926 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005927 if (PyList_Reverse(list) < 0)
5928 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005929 return list;
5930
5931 onError:
5932 Py_DECREF(list);
5933 return NULL;
5934}
5935
5936static
5937PyObject *rsplit_substring(PyUnicodeObject *self,
5938 PyObject *list,
5939 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005940 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005941{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005942 register Py_ssize_t i;
5943 register Py_ssize_t j;
5944 Py_ssize_t len = self->length;
5945 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946 PyObject *str;
5947
5948 for (i = len - sublen, j = len; i >= 0; ) {
5949 if (Py_UNICODE_MATCH(self, i, substring)) {
5950 if (maxcount-- <= 0)
5951 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005952 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005953 j = i;
5954 i -= sublen;
5955 } else
5956 i--;
5957 }
5958 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005959 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005960 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 if (PyList_Reverse(list) < 0)
5962 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005963 return list;
5964
5965 onError:
5966 Py_DECREF(list);
5967 return NULL;
5968}
5969
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970#undef SPLIT_APPEND
5971
5972static
5973PyObject *split(PyUnicodeObject *self,
5974 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005975 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
5977 PyObject *list;
5978
5979 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005980 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
5982 list = PyList_New(0);
5983 if (!list)
5984 return NULL;
5985
5986 if (substring == NULL)
5987 return split_whitespace(self,list,maxcount);
5988
5989 else if (substring->length == 1)
5990 return split_char(self,list,substring->str[0],maxcount);
5991
5992 else if (substring->length == 0) {
5993 Py_DECREF(list);
5994 PyErr_SetString(PyExc_ValueError, "empty separator");
5995 return NULL;
5996 }
5997 else
5998 return split_substring(self,list,substring,maxcount);
5999}
6000
Tim Petersced69f82003-09-16 20:30:58 +00006001static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006002PyObject *rsplit(PyUnicodeObject *self,
6003 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006004 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006005{
6006 PyObject *list;
6007
6008 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006009 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010
6011 list = PyList_New(0);
6012 if (!list)
6013 return NULL;
6014
6015 if (substring == NULL)
6016 return rsplit_whitespace(self,list,maxcount);
6017
6018 else if (substring->length == 1)
6019 return rsplit_char(self,list,substring->str[0],maxcount);
6020
6021 else if (substring->length == 0) {
6022 Py_DECREF(list);
6023 PyErr_SetString(PyExc_ValueError, "empty separator");
6024 return NULL;
6025 }
6026 else
6027 return rsplit_substring(self,list,substring,maxcount);
6028}
6029
6030static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031PyObject *replace(PyUnicodeObject *self,
6032 PyUnicodeObject *str1,
6033 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035{
6036 PyUnicodeObject *u;
6037
6038 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006039 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 if (str1->length == str2->length) {
6042 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006043 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006044 if (str1->length == 1) {
6045 /* replace characters */
6046 Py_UNICODE u1, u2;
6047 if (!findchar(self->str, self->length, str1->str[0]))
6048 goto nothing;
6049 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6050 if (!u)
6051 return NULL;
6052 Py_UNICODE_COPY(u->str, self->str, self->length);
6053 u1 = str1->str[0];
6054 u2 = str2->str[0];
6055 for (i = 0; i < u->length; i++)
6056 if (u->str[i] == u1) {
6057 if (--maxcount < 0)
6058 break;
6059 u->str[i] = u2;
6060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006062 i = fastsearch(
6063 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006065 if (i < 0)
6066 goto nothing;
6067 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6068 if (!u)
6069 return NULL;
6070 Py_UNICODE_COPY(u->str, self->str, self->length);
6071 while (i <= self->length - str1->length)
6072 if (Py_UNICODE_MATCH(self, i, str1)) {
6073 if (--maxcount < 0)
6074 break;
6075 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6076 i += str1->length;
6077 } else
6078 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006081
6082 Py_ssize_t n, i, j, e;
6083 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 Py_UNICODE *p;
6085
6086 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006087 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 if (n > maxcount)
6089 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006090 if (n == 0)
6091 goto nothing;
6092 /* new_size = self->length + n * (str2->length - str1->length)); */
6093 delta = (str2->length - str1->length);
6094 if (delta == 0) {
6095 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006097 product = n * (str2->length - str1->length);
6098 if ((product / (str2->length - str1->length)) != n) {
6099 PyErr_SetString(PyExc_OverflowError,
6100 "replace string is too long");
6101 return NULL;
6102 }
6103 new_size = self->length + product;
6104 if (new_size < 0) {
6105 PyErr_SetString(PyExc_OverflowError,
6106 "replace string is too long");
6107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
6109 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006110 u = _PyUnicode_New(new_size);
6111 if (!u)
6112 return NULL;
6113 i = 0;
6114 p = u->str;
6115 e = self->length - str1->length;
6116 if (str1->length > 0) {
6117 while (n-- > 0) {
6118 /* look for next match */
6119 j = i;
6120 while (j <= e) {
6121 if (Py_UNICODE_MATCH(self, j, str1))
6122 break;
6123 j++;
6124 }
6125 if (j > i) {
6126 if (j > e)
6127 break;
6128 /* copy unchanged part [i:j] */
6129 Py_UNICODE_COPY(p, self->str+i, j-i);
6130 p += j - i;
6131 }
6132 /* copy substitution string */
6133 if (str2->length > 0) {
6134 Py_UNICODE_COPY(p, str2->str, str2->length);
6135 p += str2->length;
6136 }
6137 i = j + str1->length;
6138 }
6139 if (i < self->length)
6140 /* copy tail [i:] */
6141 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6142 } else {
6143 /* interleave */
6144 while (n > 0) {
6145 Py_UNICODE_COPY(p, str2->str, str2->length);
6146 p += str2->length;
6147 if (--n <= 0)
6148 break;
6149 *p++ = self->str[i++];
6150 }
6151 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006155
6156nothing:
6157 /* nothing to replace; return original string (when possible) */
6158 if (PyUnicode_CheckExact(self)) {
6159 Py_INCREF(self);
6160 return (PyObject *) self;
6161 }
6162 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163}
6164
6165/* --- Unicode Object Methods --------------------------------------------- */
6166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006168"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169\n\
6170Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006171characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172
6173static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006174unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 return fixup(self, fixtitle);
6177}
6178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006179PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006180"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181\n\
6182Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006186unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 return fixup(self, fixcapitalize);
6189}
6190
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self into a list of words, replaces every list entry by its
   fixcapitalize'd version and joins the list again.  Returns the joined
   object, or NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old entry before overwriting its slot */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6228
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006229/* Argument converter. Coerces to a single unicode character */
6230
6231static int
6232convert_uc(PyObject *obj, void *addr)
6233{
6234 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6235 PyObject *uniobj;
6236 Py_UNICODE *unistr;
6237
6238 uniobj = PyUnicode_FromObject(obj);
6239 if (uniobj == NULL) {
6240 PyErr_SetString(PyExc_TypeError,
6241 "The fill character cannot be converted to Unicode");
6242 return 0;
6243 }
6244 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6245 PyErr_SetString(PyExc_TypeError,
6246 "The fill character must be exactly one character long");
6247 Py_DECREF(uniobj);
6248 return 0;
6249 }
6250 unistr = PyUnicode_AS_UNICODE(uniobj);
6251 *fillcharloc = unistr[0];
6252 Py_DECREF(uniobj);
6253 return 1;
6254}
6255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006256PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006257"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006259Return S centered in a Unicode string of length width. Padding is\n\
6260done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
6262static PyObject *
6263unicode_center(PyUnicodeObject *self, PyObject *args)
6264{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t marg, left;
6266 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006267 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
Thomas Woutersde017742006-02-16 19:34:37 +00006269 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return NULL;
6271
Tim Peters7a29bd52001-09-12 03:03:31 +00006272 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 Py_INCREF(self);
6274 return (PyObject*) self;
6275 }
6276
6277 marg = width - self->length;
6278 left = marg / 2 + (marg & width & 1);
6279
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006280 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281}
6282
Marc-André Lemburge5034372000-08-08 08:04:29 +00006283#if 0
6284
6285/* This code should go into some future Unicode collation support
6286 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006287 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006288
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006289/* speedy UTF-16 code point order comparison */
6290/* gleaned from: */
6291/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6292
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006293static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006294{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006295 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006296 0, 0, 0, 0, 0, 0, 0, 0,
6297 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006298 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006299};
6300
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301static int
6302unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 Py_UNICODE *s1 = str1->str;
6307 Py_UNICODE *s2 = str2->str;
6308
6309 len1 = str1->length;
6310 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006313 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006314
6315 c1 = *s1++;
6316 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006317
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006318 if (c1 > (1<<11) * 26)
6319 c1 += utf16Fixup[c1>>11];
6320 if (c2 > (1<<11) * 26)
6321 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006322 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006323
6324 if (c1 != c2)
6325 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006326
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006327 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 }
6329
6330 return (len1 < len2) ? -1 : (len1 != len2);
6331}
6332
Marc-André Lemburge5034372000-08-08 08:04:29 +00006333#else
6334
6335static int
6336unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006338 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006339
6340 Py_UNICODE *s1 = str1->str;
6341 Py_UNICODE *s2 = str2->str;
6342
6343 len1 = str1->length;
6344 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006345
Marc-André Lemburge5034372000-08-08 08:04:29 +00006346 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006347 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006348
Fredrik Lundh45714e92001-06-26 16:39:36 +00006349 c1 = *s1++;
6350 c2 = *s2++;
6351
6352 if (c1 != c2)
6353 return (c1 < c2) ? -1 : 1;
6354
Marc-André Lemburge5034372000-08-08 08:04:29 +00006355 len1--; len2--;
6356 }
6357
6358 return (len1 < len2) ? -1 : (len1 != len2);
6359}
6360
6361#endif
6362
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363int PyUnicode_Compare(PyObject *left,
6364 PyObject *right)
6365{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006366 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6367 return unicode_compare((PyUnicodeObject *)left,
6368 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006369 PyErr_Format(PyExc_TypeError,
6370 "Can't compare %.100s and %.100s",
6371 left->ob_type->tp_name,
6372 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 return -1;
6374}
6375
Martin v. Löwis5b222132007-06-10 09:51:05 +00006376int
6377PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6378{
6379 int i;
6380 Py_UNICODE *id;
6381 assert(PyUnicode_Check(uni));
6382 id = PyUnicode_AS_UNICODE(uni);
6383 /* Compare Unicode string and source character set string */
6384 for (i = 0; id[i] && str[i]; i++)
6385 if (id[i] != str[i])
6386 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6387 if (id[i])
6388 return 1; /* uni is longer */
6389 if (str[i])
6390 return -1; /* str is longer */
6391 return 0;
6392}
6393
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006394PyObject *PyUnicode_RichCompare(PyObject *left,
6395 PyObject *right,
6396 int op)
6397{
6398 int result;
6399
6400 result = PyUnicode_Compare(left, right);
6401 if (result == -1 && PyErr_Occurred())
6402 goto onError;
6403
6404 /* Convert the return value to a Boolean */
6405 switch (op) {
6406 case Py_EQ:
6407 result = (result == 0);
6408 break;
6409 case Py_NE:
6410 result = (result != 0);
6411 break;
6412 case Py_LE:
6413 result = (result <= 0);
6414 break;
6415 case Py_GE:
6416 result = (result >= 0);
6417 break;
6418 case Py_LT:
6419 result = (result == -1);
6420 break;
6421 case Py_GT:
6422 result = (result == 1);
6423 break;
6424 }
6425 return PyBool_FromLong(result);
6426
6427 onError:
6428
6429 /* Standard case
6430
6431 Type errors mean that PyUnicode_FromObject() could not convert
6432 one of the arguments (usually the right hand side) to Unicode,
6433 ie. we can't handle the comparison request. However, it is
6434 possible that the other object knows a comparison method, which
6435 is why we return Py_NotImplemented to give the other object a
6436 chance.
6437
6438 */
6439 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6440 PyErr_Clear();
6441 Py_INCREF(Py_NotImplemented);
6442 return Py_NotImplemented;
6443 }
6444 if (op != Py_EQ && op != Py_NE)
6445 return NULL;
6446
6447 /* Equality comparison.
6448
6449 This is a special case: we silence any PyExc_UnicodeDecodeError
6450 and instead turn it into a PyErr_UnicodeWarning.
6451
6452 */
6453 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6454 return NULL;
6455 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006456 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6457 (op == Py_EQ) ?
6458 "Unicode equal comparison "
6459 "failed to convert both arguments to Unicode - "
6460 "interpreting them as being unequal"
6461 :
6462 "Unicode unequal comparison "
6463 "failed to convert both arguments to Unicode - "
6464 "interpreting them as being unequal",
6465 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006466 return NULL;
6467 result = (op == Py_NE);
6468 return PyBool_FromLong(result);
6469}
6470
Guido van Rossum403d68b2000-03-13 15:55:09 +00006471int PyUnicode_Contains(PyObject *container,
6472 PyObject *element)
6473{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006474 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006476
6477 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 sub = PyUnicode_FromObject(element);
6479 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006480 PyErr_Format(PyExc_TypeError,
6481 "'in <string>' requires string as left operand, not %s",
6482 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006483 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006484 }
6485
Thomas Wouters477c8d52006-05-27 19:21:47 +00006486 str = PyUnicode_FromObject(container);
6487 if (!str) {
6488 Py_DECREF(sub);
6489 return -1;
6490 }
6491
6492 result = stringlib_contains_obj(str, sub);
6493
6494 Py_DECREF(str);
6495 Py_DECREF(sub);
6496
Guido van Rossum403d68b2000-03-13 15:55:09 +00006497 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006498}
6499
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500/* Concat to string or Unicode object giving a new Unicode object. */
6501
6502PyObject *PyUnicode_Concat(PyObject *left,
6503 PyObject *right)
6504{
6505 PyUnicodeObject *u = NULL, *v = NULL, *w;
6506
6507 /* Coerce the two arguments */
6508 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6509 if (u == NULL)
6510 goto onError;
6511 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6512 if (v == NULL)
6513 goto onError;
6514
6515 /* Shortcuts */
6516 if (v == unicode_empty) {
6517 Py_DECREF(v);
6518 return (PyObject *)u;
6519 }
6520 if (u == unicode_empty) {
6521 Py_DECREF(u);
6522 return (PyObject *)v;
6523 }
6524
6525 /* Concat the two Unicode strings */
6526 w = _PyUnicode_New(u->length + v->length);
6527 if (w == NULL)
6528 goto onError;
6529 Py_UNICODE_COPY(w->str, u->str, u->length);
6530 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6531
6532 Py_DECREF(u);
6533 Py_DECREF(v);
6534 return (PyObject *)w;
6535
6536onError:
6537 Py_XDECREF(u);
6538 Py_XDECREF(v);
6539 return NULL;
6540}
6541
Walter Dörwald1ab83302007-05-18 17:15:44 +00006542void
6543PyUnicode_Append(PyObject **pleft, PyObject *right)
6544{
6545 PyObject *new;
6546 if (*pleft == NULL)
6547 return;
6548 if (right == NULL || !PyUnicode_Check(*pleft)) {
6549 Py_DECREF(*pleft);
6550 *pleft = NULL;
6551 return;
6552 }
6553 new = PyUnicode_Concat(*pleft, right);
6554 Py_DECREF(*pleft);
6555 *pleft = new;
6556}
6557
6558void
6559PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6560{
6561 PyUnicode_Append(pleft, right);
6562 Py_XDECREF(right);
6563}
6564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566"S.count(sub[, start[, end]]) -> int\n\
6567\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006568Return the number of non-overlapping occurrences of substring sub in\n\
6569Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
6572static PyObject *
6573unicode_count(PyUnicodeObject *self, PyObject *args)
6574{
6575 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006577 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 PyObject *result;
6579
Guido van Rossumb8872e62000-05-09 14:14:27 +00006580 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6581 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return NULL;
6583
6584 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006585 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 if (substring == NULL)
6587 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006588
Thomas Wouters477c8d52006-05-27 19:21:47 +00006589 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Christian Heimes217cfd12007-12-02 14:31:20 +00006591 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 stringlib_count(self->str + start, end - start,
6593 substring->str, substring->length)
6594 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return result;
6599}
6600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006601PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006602"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006604Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006605to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006606handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6608'xmlcharrefreplace' as well as any other name registered with\n\
6609codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611static PyObject *
6612unicode_encode(PyUnicodeObject *self, PyObject *args)
6613{
6614 char *encoding = NULL;
6615 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006616 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6619 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006620 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006621 if (v == NULL)
6622 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006623 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006624 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006625 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006626 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006627 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006628 Py_DECREF(v);
6629 return NULL;
6630 }
6631 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006632
6633 onError:
6634 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006635}
6636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006638"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639\n\
6640Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006641If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
6643static PyObject*
6644unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6645{
6646 Py_UNICODE *e;
6647 Py_UNICODE *p;
6648 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006649 Py_UNICODE *qe;
6650 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 PyUnicodeObject *u;
6652 int tabsize = 8;
6653
6654 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6655 return NULL;
6656
Thomas Wouters7e474022000-07-16 12:04:32 +00006657 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006658 i = 0; /* chars up to and including most recent \n or \r */
6659 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6660 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 for (p = self->str; p < e; p++)
6662 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006663 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006664 incr = tabsize - (j % tabsize); /* cannot overflow */
6665 if (j > PY_SSIZE_T_MAX - incr)
6666 goto overflow1;
6667 j += incr;
6668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 }
6670 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006671 if (j > PY_SSIZE_T_MAX - 1)
6672 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 j++;
6674 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006675 if (i > PY_SSIZE_T_MAX - j)
6676 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006678 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 }
6680 }
6681
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006682 if (i > PY_SSIZE_T_MAX - j)
6683 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 /* Second pass: create output string and fill it */
6686 u = _PyUnicode_New(i + j);
6687 if (!u)
6688 return NULL;
6689
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006690 j = 0; /* same as in first pass */
6691 q = u->str; /* next output char */
6692 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694 for (p = self->str; p < e; p++)
6695 if (*p == '\t') {
6696 if (tabsize > 0) {
6697 i = tabsize - (j % tabsize);
6698 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006699 while (i--) {
6700 if (q >= qe)
6701 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 }
6705 }
6706 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006707 if (q >= qe)
6708 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006710 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 if (*p == '\n' || *p == '\r')
6712 j = 0;
6713 }
6714
6715 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006716
6717 overflow2:
6718 Py_DECREF(u);
6719 overflow1:
6720 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6721 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722}
6723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006724PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006725"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726\n\
6727Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006728such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729arguments start and end are interpreted as in slice notation.\n\
6730\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733static PyObject *
6734unicode_find(PyUnicodeObject *self, PyObject *args)
6735{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006736 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006737 Py_ssize_t start;
6738 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006739 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
Christian Heimes9cd17752007-11-18 19:35:23 +00006741 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743
Thomas Wouters477c8d52006-05-27 19:21:47 +00006744 result = stringlib_find_slice(
6745 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6746 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6747 start, end
6748 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
6750 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006751
Christian Heimes217cfd12007-12-02 14:31:20 +00006752 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753}
6754
6755static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006756unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
6758 if (index < 0 || index >= self->length) {
6759 PyErr_SetString(PyExc_IndexError, "string index out of range");
6760 return NULL;
6761 }
6762
6763 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6764}
6765
Guido van Rossumc2504932007-09-18 19:42:40 +00006766/* Believe it or not, this produces the same value for ASCII strings
6767 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006769unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Guido van Rossumc2504932007-09-18 19:42:40 +00006771 Py_ssize_t len;
6772 Py_UNICODE *p;
6773 long x;
6774
6775 if (self->hash != -1)
6776 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006777 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006778 p = self->str;
6779 x = *p << 7;
6780 while (--len >= 0)
6781 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006782 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006783 if (x == -1)
6784 x = -2;
6785 self->hash = x;
6786 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787}
6788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006789PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006790"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006792Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
6794static PyObject *
6795unicode_index(PyUnicodeObject *self, PyObject *args)
6796{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006797 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006798 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006799 Py_ssize_t start;
6800 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
Christian Heimes9cd17752007-11-18 19:35:23 +00006802 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
Thomas Wouters477c8d52006-05-27 19:21:47 +00006805 result = stringlib_find_slice(
6806 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6807 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6808 start, end
6809 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 if (result < 0) {
6814 PyErr_SetString(PyExc_ValueError, "substring not found");
6815 return NULL;
6816 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006817
Christian Heimes217cfd12007-12-02 14:31:20 +00006818 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819}
6820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006821PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
6827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006828unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829{
6830 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6831 register const Py_UNICODE *e;
6832 int cased;
6833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 /* Shortcut for single character strings */
6835 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006838 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006839 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 e = p + PyUnicode_GET_SIZE(self);
6843 cased = 0;
6844 for (; p < e; p++) {
6845 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 else if (!cased && Py_UNICODE_ISLOWER(ch))
6850 cased = 1;
6851 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006858Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860
6861static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006862unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863{
6864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6865 register const Py_UNICODE *e;
6866 int cased;
6867
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 /* Shortcut for single character strings */
6869 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006872 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006873 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 e = p + PyUnicode_GET_SIZE(self);
6877 cased = 0;
6878 for (; p < e; p++) {
6879 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 else if (!cased && Py_UNICODE_ISUPPER(ch))
6884 cased = 1;
6885 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887}
6888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006889PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006892Return True if S is a titlecased string and there is at least one\n\
6893character in S, i.e. upper- and titlecase characters may only\n\
6894follow uncased characters and lowercase characters only cased ones.\n\
6895Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006898unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6901 register const Py_UNICODE *e;
6902 int cased, previous_is_cased;
6903
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 /* Shortcut for single character strings */
6905 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6907 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006909 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006910 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 e = p + PyUnicode_GET_SIZE(self);
6914 cased = 0;
6915 previous_is_cased = 0;
6916 for (; p < e; p++) {
6917 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006918
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6920 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 previous_is_cased = 1;
6923 cased = 1;
6924 }
6925 else if (Py_UNICODE_ISLOWER(ch)) {
6926 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 previous_is_cased = 1;
6929 cased = 1;
6930 }
6931 else
6932 previous_is_cased = 0;
6933 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935}
6936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006940Return True if all characters in S are whitespace\n\
6941and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
6943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006944unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945{
6946 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6947 register const Py_UNICODE *e;
6948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 /* Shortcut for single character strings */
6950 if (PyUnicode_GET_SIZE(self) == 1 &&
6951 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006954 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006955 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006957
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 e = p + PyUnicode_GET_SIZE(self);
6959 for (; p < e; p++) {
6960 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964}
6965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006968\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006969Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006971
6972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006973unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006974{
6975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6976 register const Py_UNICODE *e;
6977
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006978 /* Shortcut for single character strings */
6979 if (PyUnicode_GET_SIZE(self) == 1 &&
6980 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006981 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006982
6983 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006984 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006986
6987 e = p + PyUnicode_GET_SIZE(self);
6988 for (; p < e; p++) {
6989 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006993}
6994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006995PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006997\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006998Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006999and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007000
7001static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007002unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007003{
7004 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7005 register const Py_UNICODE *e;
7006
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007007 /* Shortcut for single character strings */
7008 if (PyUnicode_GET_SIZE(self) == 1 &&
7009 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007010 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007011
7012 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007013 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007015
7016 e = p + PyUnicode_GET_SIZE(self);
7017 for (; p < e; p++) {
7018 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007020 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007021 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007022}
7023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007027Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007028False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029
7030static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007031unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032{
7033 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7034 register const Py_UNICODE *e;
7035
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 /* Shortcut for single character strings */
7037 if (PyUnicode_GET_SIZE(self) == 1 &&
7038 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007039 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007041 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007042 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007043 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007044
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 e = p + PyUnicode_GET_SIZE(self);
7046 for (; p < e; p++) {
7047 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007048 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007050 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051}
7052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007053PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007056Return True if all characters in S are digits\n\
7057and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058
7059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007060unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
7062 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7063 register const Py_UNICODE *e;
7064
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 /* Shortcut for single character strings */
7066 if (PyUnicode_GET_SIZE(self) == 1 &&
7067 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007068 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007070 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007071 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007073
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 e = p + PyUnicode_GET_SIZE(self);
7075 for (; p < e; p++) {
7076 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007077 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007079 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007082PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007083"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007085Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
7088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007089unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090{
7091 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7092 register const Py_UNICODE *e;
7093
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 /* Shortcut for single character strings */
7095 if (PyUnicode_GET_SIZE(self) == 1 &&
7096 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007097 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007099 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007100 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007101 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007102
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 e = p + PyUnicode_GET_SIZE(self);
7104 for (; p < e; p++) {
7105 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007106 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109}
7110
Martin v. Löwis47383402007-08-15 07:32:56 +00007111int
7112PyUnicode_IsIdentifier(PyObject *self)
7113{
7114 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7115 register const Py_UNICODE *e;
7116
7117 /* Special case for empty strings */
7118 if (PyUnicode_GET_SIZE(self) == 0)
7119 return 0;
7120
7121 /* PEP 3131 says that the first character must be in
7122 XID_Start and subsequent characters in XID_Continue,
7123 and for the ASCII range, the 2.x rules apply (i.e
7124 start with letters and underscore, continue with
7125 letters, digits, underscore). However, given the current
7126 definition of XID_Start and XID_Continue, it is sufficient
7127 to check just for these, except that _ must be allowed
7128 as starting an identifier. */
7129 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7130 return 0;
7131
7132 e = p + PyUnicode_GET_SIZE(self);
7133 for (p++; p < e; p++) {
7134 if (!_PyUnicode_IsXidContinue(*p))
7135 return 0;
7136 }
7137 return 1;
7138}
7139
7140PyDoc_STRVAR(isidentifier__doc__,
7141"S.isidentifier() -> bool\n\
7142\n\
7143Return True if S is a valid identifier according\n\
7144to the language definition.");
7145
7146static PyObject*
7147unicode_isidentifier(PyObject *self)
7148{
7149 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7150}
7151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007152PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007153"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154\n\
7155Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007159unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007161 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165unicode_length(PyUnicodeObject *self)
7166{
7167 return self->length;
7168}
7169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007171"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172\n\
7173Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007174done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175
7176static PyObject *
7177unicode_ljust(PyUnicodeObject *self, PyObject *args)
7178{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007179 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007180 Py_UNICODE fillchar = ' ';
7181
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007182 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 return NULL;
7184
Tim Peters7a29bd52001-09-12 03:03:31 +00007185 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 Py_INCREF(self);
7187 return (PyObject*) self;
7188 }
7189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007190 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191}
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007194"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007196Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007199unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return fixup(self, fixlower);
7202}
7203
/* Selectors telling the strip helpers which end(s) of the string to trim.
   The values double as indices into stripformat[] below, so they must
   stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above.
   The table is read-only, so both the strings and the slots are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for error messages: the format string with its
   3-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7212
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213/* externally visible for str.strip(unicode) */
7214PyObject *
7215_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7216{
7217 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007218 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007219 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7221 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222
Thomas Wouters477c8d52006-05-27 19:21:47 +00007223 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7224
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007225 i = 0;
7226 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007227 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7228 i++;
7229 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007230 }
7231
7232 j = len;
7233 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007234 do {
7235 j--;
7236 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7237 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238 }
7239
7240 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007241 Py_INCREF(self);
7242 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007243 }
7244 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007245 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246}
7247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007252 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007254
7255 i = 0;
7256 if (striptype != RIGHTSTRIP) {
7257 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7258 i++;
7259 }
7260 }
7261
7262 j = len;
7263 if (striptype != LEFTSTRIP) {
7264 do {
7265 j--;
7266 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7267 j++;
7268 }
7269
7270 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7271 Py_INCREF(self);
7272 return (PyObject*)self;
7273 }
7274 else
7275 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276}
7277
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007278
7279static PyObject *
7280do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7281{
7282 PyObject *sep = NULL;
7283
7284 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7285 return NULL;
7286
7287 if (sep != NULL && sep != Py_None) {
7288 if (PyUnicode_Check(sep))
7289 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007290 else {
7291 PyErr_Format(PyExc_TypeError,
7292 "%s arg must be None, unicode or str",
7293 STRIPNAME(striptype));
7294 return NULL;
7295 }
7296 }
7297
7298 return do_strip(self, striptype);
7299}
7300
7301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007302PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007303"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007304\n\
7305Return a copy of the string S with leading and trailing\n\
7306whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007307If chars is given and not None, remove characters in chars instead.\n\
7308If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007309
7310static PyObject *
7311unicode_strip(PyUnicodeObject *self, PyObject *args)
7312{
7313 if (PyTuple_GET_SIZE(args) == 0)
7314 return do_strip(self, BOTHSTRIP); /* Common case */
7315 else
7316 return do_argstrip(self, BOTHSTRIP, args);
7317}
7318
7319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007321"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007322\n\
7323Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007324If chars is given and not None, remove characters in chars instead.\n\
7325If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007326
7327static PyObject *
7328unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7329{
7330 if (PyTuple_GET_SIZE(args) == 0)
7331 return do_strip(self, LEFTSTRIP); /* Common case */
7332 else
7333 return do_argstrip(self, LEFTSTRIP, args);
7334}
7335
7336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007338"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007339\n\
7340Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007341If chars is given and not None, remove characters in chars instead.\n\
7342If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343
7344static PyObject *
7345unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7346{
7347 if (PyTuple_GET_SIZE(args) == 0)
7348 return do_strip(self, RIGHTSTRIP); /* Common case */
7349 else
7350 return do_argstrip(self, RIGHTSTRIP, args);
7351}
7352
7353
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007355unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356{
7357 PyUnicodeObject *u;
7358 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007360 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362 if (len < 0)
7363 len = 0;
7364
Tim Peters7a29bd52001-09-12 03:03:31 +00007365 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 /* no repeat, return original string */
7367 Py_INCREF(str);
7368 return (PyObject*) str;
7369 }
Tim Peters8f422462000-09-09 06:13:41 +00007370
7371 /* ensure # of chars needed doesn't overflow int and # of bytes
7372 * needed doesn't overflow size_t
7373 */
7374 nchars = len * str->length;
7375 if (len && nchars / len != str->length) {
7376 PyErr_SetString(PyExc_OverflowError,
7377 "repeated string is too long");
7378 return NULL;
7379 }
7380 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7381 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7382 PyErr_SetString(PyExc_OverflowError,
7383 "repeated string is too long");
7384 return NULL;
7385 }
7386 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 if (!u)
7388 return NULL;
7389
7390 p = u->str;
7391
Thomas Wouters477c8d52006-05-27 19:21:47 +00007392 if (str->length == 1 && len > 0) {
7393 Py_UNICODE_FILL(p, str->str[0], len);
7394 } else {
7395 Py_ssize_t done = 0; /* number of characters copied this far */
7396 if (done < nchars) {
7397 Py_UNICODE_COPY(p, str->str, str->length);
7398 done = str->length;
7399 }
7400 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007401 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007402 Py_UNICODE_COPY(p+done, p, n);
7403 done += n;
7404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 }
7406
7407 return (PyObject*) u;
7408}
7409
7410PyObject *PyUnicode_Replace(PyObject *obj,
7411 PyObject *subobj,
7412 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007413 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414{
7415 PyObject *self;
7416 PyObject *str1;
7417 PyObject *str2;
7418 PyObject *result;
7419
7420 self = PyUnicode_FromObject(obj);
7421 if (self == NULL)
7422 return NULL;
7423 str1 = PyUnicode_FromObject(subobj);
7424 if (str1 == NULL) {
7425 Py_DECREF(self);
7426 return NULL;
7427 }
7428 str2 = PyUnicode_FromObject(replobj);
7429 if (str2 == NULL) {
7430 Py_DECREF(self);
7431 Py_DECREF(str1);
7432 return NULL;
7433 }
Tim Petersced69f82003-09-16 20:30:58 +00007434 result = replace((PyUnicodeObject *)self,
7435 (PyUnicodeObject *)str1,
7436 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 maxcount);
7438 Py_DECREF(self);
7439 Py_DECREF(str1);
7440 Py_DECREF(str2);
7441 return result;
7442}
7443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007444PyDoc_STRVAR(replace__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007445"S.replace (old, new[, maxsplit]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446\n\
7447Return a copy of S with all occurrences of substring\n\
7448old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007449given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
7451static PyObject*
7452unicode_replace(PyUnicodeObject *self, PyObject *args)
7453{
7454 PyUnicodeObject *str1;
7455 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007456 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 PyObject *result;
7458
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 return NULL;
7461 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7462 if (str1 == NULL)
7463 return NULL;
7464 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007465 if (str2 == NULL) {
7466 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
7470 result = replace(self, str1, str2, maxcount);
7471
7472 Py_DECREF(str1);
7473 Py_DECREF(str2);
7474 return result;
7475}
7476
7477static
7478PyObject *unicode_repr(PyObject *unicode)
7479{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007480 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007481 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007482 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7483 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7484
7485 /* XXX(nnorwitz): rather than over-allocating, it would be
7486 better to choose a different scheme. Perhaps scan the
7487 first N-chars of the string and allocate based on that size.
7488 */
7489 /* Initial allocation is based on the longest-possible unichr
7490 escape.
7491
7492 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7493 unichr, so in this case it's the longest unichr escape. In
7494 narrow (UTF-16) builds this is five chars per source unichr
7495 since there are two unichrs in the surrogate pair, so in narrow
7496 (UTF-16) builds it's not the longest unichr escape.
7497
7498 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7499 so in the narrow (UTF-16) build case it's the longest unichr
7500 escape.
7501 */
7502
Walter Dörwald1ab83302007-05-18 17:15:44 +00007503 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007504 2 /* quotes */
7505#ifdef Py_UNICODE_WIDE
7506 + 10*size
7507#else
7508 + 6*size
7509#endif
7510 + 1);
7511 if (repr == NULL)
7512 return NULL;
7513
Walter Dörwald1ab83302007-05-18 17:15:44 +00007514 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007515
7516 /* Add quote */
7517 *p++ = (findchar(s, size, '\'') &&
7518 !findchar(s, size, '"')) ? '"' : '\'';
7519 while (size-- > 0) {
7520 Py_UNICODE ch = *s++;
7521
7522 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007523 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007524 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007525 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007526 continue;
7527 }
7528
7529#ifdef Py_UNICODE_WIDE
7530 /* Map 21-bit characters to '\U00xxxxxx' */
7531 else if (ch >= 0x10000) {
7532 *p++ = '\\';
7533 *p++ = 'U';
7534 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7535 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7536 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7537 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7538 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7539 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7540 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7541 *p++ = hexdigits[ch & 0x0000000F];
7542 continue;
7543 }
7544#else
7545 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7546 else if (ch >= 0xD800 && ch < 0xDC00) {
7547 Py_UNICODE ch2;
7548 Py_UCS4 ucs;
7549
7550 ch2 = *s++;
7551 size--;
7552 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7553 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7554 *p++ = '\\';
7555 *p++ = 'U';
7556 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7557 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7558 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7559 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7560 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7561 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7562 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7563 *p++ = hexdigits[ucs & 0x0000000F];
7564 continue;
7565 }
7566 /* Fall through: isolated surrogates are copied as-is */
7567 s--;
7568 size++;
7569 }
7570#endif
7571
7572 /* Map 16-bit characters to '\uxxxx' */
7573 if (ch >= 256) {
7574 *p++ = '\\';
7575 *p++ = 'u';
7576 *p++ = hexdigits[(ch >> 12) & 0x000F];
7577 *p++ = hexdigits[(ch >> 8) & 0x000F];
7578 *p++ = hexdigits[(ch >> 4) & 0x000F];
7579 *p++ = hexdigits[ch & 0x000F];
7580 }
7581
7582 /* Map special whitespace to '\t', \n', '\r' */
7583 else if (ch == '\t') {
7584 *p++ = '\\';
7585 *p++ = 't';
7586 }
7587 else if (ch == '\n') {
7588 *p++ = '\\';
7589 *p++ = 'n';
7590 }
7591 else if (ch == '\r') {
7592 *p++ = '\\';
7593 *p++ = 'r';
7594 }
7595
7596 /* Map non-printable US ASCII to '\xhh' */
7597 else if (ch < ' ' || ch >= 0x7F) {
7598 *p++ = '\\';
7599 *p++ = 'x';
7600 *p++ = hexdigits[(ch >> 4) & 0x000F];
7601 *p++ = hexdigits[ch & 0x000F];
7602 }
7603
7604 /* Copy everything else as-is */
7605 else
7606 *p++ = (char) ch;
7607 }
7608 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007609 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007610
7611 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007612 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007613 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007617"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
7619Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007620such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621arguments start and end are interpreted as in slice notation.\n\
7622\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007623Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
7625static PyObject *
7626unicode_rfind(PyUnicodeObject *self, PyObject *args)
7627{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007629 Py_ssize_t start;
7630 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007631 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Christian Heimes9cd17752007-11-18 19:35:23 +00007633 if (!_ParseTupleFinds(args, &substring, &start, &end))
7634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635
Thomas Wouters477c8d52006-05-27 19:21:47 +00007636 result = stringlib_rfind_slice(
7637 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7638 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7639 start, end
7640 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007643
Christian Heimes217cfd12007-12-02 14:31:20 +00007644 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007648"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
7652static PyObject *
7653unicode_rindex(PyUnicodeObject *self, PyObject *args)
7654{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007655 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007656 Py_ssize_t start;
7657 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007658 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659
Christian Heimes9cd17752007-11-18 19:35:23 +00007660 if (!_ParseTupleFinds(args, &substring, &start, &end))
7661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
Thomas Wouters477c8d52006-05-27 19:21:47 +00007663 result = stringlib_rfind_slice(
7664 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7665 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7666 start, end
7667 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668
7669 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007670
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 if (result < 0) {
7672 PyErr_SetString(PyExc_ValueError, "substring not found");
7673 return NULL;
7674 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007675 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676}
7677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007679"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680\n\
7681Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007682done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
7684static PyObject *
7685unicode_rjust(PyUnicodeObject *self, PyObject *args)
7686{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007687 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007688 Py_UNICODE fillchar = ' ';
7689
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007690 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 return NULL;
7692
Tim Peters7a29bd52001-09-12 03:03:31 +00007693 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 Py_INCREF(self);
7695 return (PyObject*) self;
7696 }
7697
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007698 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699}
7700
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701PyObject *PyUnicode_Split(PyObject *s,
7702 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007703 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704{
7705 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007706
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 s = PyUnicode_FromObject(s);
7708 if (s == NULL)
7709 return NULL;
7710 if (sep != NULL) {
7711 sep = PyUnicode_FromObject(sep);
7712 if (sep == NULL) {
7713 Py_DECREF(s);
7714 return NULL;
7715 }
7716 }
7717
7718 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7719
7720 Py_DECREF(s);
7721 Py_XDECREF(sep);
7722 return result;
7723}
7724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007726"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727\n\
7728Return a list of the words in S, using sep as the\n\
7729delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007730splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007731whitespace string is a separator and empty strings are\n\
7732removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
7734static PyObject*
7735unicode_split(PyUnicodeObject *self, PyObject *args)
7736{
7737 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739
Martin v. Löwis18e16552006-02-15 17:27:45 +00007740 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 return NULL;
7742
7743 if (substring == Py_None)
7744 return split(self, NULL, maxcount);
7745 else if (PyUnicode_Check(substring))
7746 return split(self, (PyUnicodeObject *)substring, maxcount);
7747 else
7748 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7749}
7750
Thomas Wouters477c8d52006-05-27 19:21:47 +00007751PyObject *
7752PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7753{
7754 PyObject* str_obj;
7755 PyObject* sep_obj;
7756 PyObject* out;
7757
7758 str_obj = PyUnicode_FromObject(str_in);
7759 if (!str_obj)
7760 return NULL;
7761 sep_obj = PyUnicode_FromObject(sep_in);
7762 if (!sep_obj) {
7763 Py_DECREF(str_obj);
7764 return NULL;
7765 }
7766
7767 out = stringlib_partition(
7768 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7769 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7770 );
7771
7772 Py_DECREF(sep_obj);
7773 Py_DECREF(str_obj);
7774
7775 return out;
7776}
7777
7778
7779PyObject *
7780PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7781{
7782 PyObject* str_obj;
7783 PyObject* sep_obj;
7784 PyObject* out;
7785
7786 str_obj = PyUnicode_FromObject(str_in);
7787 if (!str_obj)
7788 return NULL;
7789 sep_obj = PyUnicode_FromObject(sep_in);
7790 if (!sep_obj) {
7791 Py_DECREF(str_obj);
7792 return NULL;
7793 }
7794
7795 out = stringlib_rpartition(
7796 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7797 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7798 );
7799
7800 Py_DECREF(sep_obj);
7801 Py_DECREF(str_obj);
7802
7803 return out;
7804}
7805
7806PyDoc_STRVAR(partition__doc__,
7807"S.partition(sep) -> (head, sep, tail)\n\
7808\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007809Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007810the separator itself, and the part after it. If the separator is not\n\
7811found, returns S and two empty strings.");
7812
7813static PyObject*
7814unicode_partition(PyUnicodeObject *self, PyObject *separator)
7815{
7816 return PyUnicode_Partition((PyObject *)self, separator);
7817}
7818
7819PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007820"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007821\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007822Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007823the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007824separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007825
7826static PyObject*
7827unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7828{
7829 return PyUnicode_RPartition((PyObject *)self, separator);
7830}
7831
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007832PyObject *PyUnicode_RSplit(PyObject *s,
7833 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007835{
7836 PyObject *result;
7837
7838 s = PyUnicode_FromObject(s);
7839 if (s == NULL)
7840 return NULL;
7841 if (sep != NULL) {
7842 sep = PyUnicode_FromObject(sep);
7843 if (sep == NULL) {
7844 Py_DECREF(s);
7845 return NULL;
7846 }
7847 }
7848
7849 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7850
7851 Py_DECREF(s);
7852 Py_XDECREF(sep);
7853 return result;
7854}
7855
7856PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007857"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007858\n\
7859Return a list of the words in S, using sep as the\n\
7860delimiter string, starting at the end of the string and\n\
7861working to the front. If maxsplit is given, at most maxsplit\n\
7862splits are done. If sep is not specified, any whitespace string\n\
7863is a separator.");
7864
7865static PyObject*
7866unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7867{
7868 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007870
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007872 return NULL;
7873
7874 if (substring == Py_None)
7875 return rsplit(self, NULL, maxcount);
7876 else if (PyUnicode_Check(substring))
7877 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7878 else
7879 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7880}
7881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007882PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007883"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884\n\
7885Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007886Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888
7889static PyObject*
7890unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7891{
Guido van Rossum86662912000-04-11 15:38:46 +00007892 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893
Guido van Rossum86662912000-04-11 15:38:46 +00007894 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 return NULL;
7896
Guido van Rossum86662912000-04-11 15:38:46 +00007897 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898}
7899
7900static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007901PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902{
Walter Dörwald346737f2007-05-31 10:44:43 +00007903 if (PyUnicode_CheckExact(self)) {
7904 Py_INCREF(self);
7905 return self;
7906 } else
7907 /* Subtype -- return genuine unicode string with the same value. */
7908 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7909 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910}
7911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007912PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007913"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914\n\
7915Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007916and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
7918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921 return fixup(self, fixswapcase);
7922}
7923
Georg Brandlceee0772007-11-27 23:48:05 +00007924PyDoc_STRVAR(maketrans__doc__,
7925"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7926\n\
7927Return a translation table usable for str.translate().\n\
7928If there is only one argument, it must be a dictionary mapping Unicode\n\
7929ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7930Character keys will then be converted to ordinals.\n\
7931If there are two arguments, they must be strings of equal length, and\n\
7932in the resulting dictionary, each character in x will be mapped to the\n\
7933character at the same position in y. If there is a third argument, it\n\
7934must be a string, whose characters will be mapped to None in the result.");
7935
7936static PyObject*
7937unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7938{
7939 PyObject *x, *y = NULL, *z = NULL;
7940 PyObject *new = NULL, *key, *value;
7941 Py_ssize_t i = 0;
7942 int res;
7943
7944 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7945 return NULL;
7946 new = PyDict_New();
7947 if (!new)
7948 return NULL;
7949 if (y != NULL) {
7950 /* x must be a string too, of equal length */
7951 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7952 if (!PyUnicode_Check(x)) {
7953 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7954 "be a string if there is a second argument");
7955 goto err;
7956 }
7957 if (PyUnicode_GET_SIZE(x) != ylen) {
7958 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7959 "arguments must have equal length");
7960 goto err;
7961 }
7962 /* create entries for translating chars in x to those in y */
7963 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007964 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7965 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007966 if (!key || !value)
7967 goto err;
7968 res = PyDict_SetItem(new, key, value);
7969 Py_DECREF(key);
7970 Py_DECREF(value);
7971 if (res < 0)
7972 goto err;
7973 }
7974 /* create entries for deleting chars in z */
7975 if (z != NULL) {
7976 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007977 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007978 if (!key)
7979 goto err;
7980 res = PyDict_SetItem(new, key, Py_None);
7981 Py_DECREF(key);
7982 if (res < 0)
7983 goto err;
7984 }
7985 }
7986 } else {
7987 /* x must be a dict */
7988 if (!PyDict_Check(x)) {
7989 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7990 "to maketrans it must be a dict");
7991 goto err;
7992 }
7993 /* copy entries into the new dict, converting string keys to int keys */
7994 while (PyDict_Next(x, &i, &key, &value)) {
7995 if (PyUnicode_Check(key)) {
7996 /* convert string keys to integer keys */
7997 PyObject *newkey;
7998 if (PyUnicode_GET_SIZE(key) != 1) {
7999 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8000 "table must be of length 1");
8001 goto err;
8002 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008003 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008004 if (!newkey)
8005 goto err;
8006 res = PyDict_SetItem(new, newkey, value);
8007 Py_DECREF(newkey);
8008 if (res < 0)
8009 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008010 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008011 /* just keep integer keys */
8012 if (PyDict_SetItem(new, key, value) < 0)
8013 goto err;
8014 } else {
8015 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8016 "be strings or integers");
8017 goto err;
8018 }
8019 }
8020 }
8021 return new;
8022 err:
8023 Py_DECREF(new);
8024 return NULL;
8025}
8026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008027PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008028"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029\n\
8030Return a copy of the string S, where all characters have been mapped\n\
8031through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008032Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8033Unmapped characters are left untouched. Characters mapped to None\n\
8034are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
8036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008037unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Georg Brandlceee0772007-11-27 23:48:05 +00008039 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008042PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008043"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008045Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
8047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008048unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return fixup(self, fixupper);
8051}
8052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008053PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008054"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055\n\
8056Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008057of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058
8059static PyObject *
8060unicode_zfill(PyUnicodeObject *self, PyObject *args)
8061{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 PyUnicodeObject *u;
8064
Martin v. Löwis18e16552006-02-15 17:27:45 +00008065 Py_ssize_t width;
8066 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 return NULL;
8068
8069 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008070 if (PyUnicode_CheckExact(self)) {
8071 Py_INCREF(self);
8072 return (PyObject*) self;
8073 }
8074 else
8075 return PyUnicode_FromUnicode(
8076 PyUnicode_AS_UNICODE(self),
8077 PyUnicode_GET_SIZE(self)
8078 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 }
8080
8081 fill = width - self->length;
8082
8083 u = pad(self, fill, 0, '0');
8084
Walter Dörwald068325e2002-04-15 13:36:47 +00008085 if (u == NULL)
8086 return NULL;
8087
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 if (u->str[fill] == '+' || u->str[fill] == '-') {
8089 /* move sign to beginning of string */
8090 u->str[0] = u->str[fill];
8091 u->str[fill] = '0';
8092 }
8093
8094 return (PyObject*) u;
8095}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096
#if 0
/* Debugging helper, compiled out: reports the value of the file-level
   `numfree` counter as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008105PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008106"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008108Return True if S starts with the specified prefix, False otherwise.\n\
8109With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008110With optional end, stop comparing S at that position.\n\
8111prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
8113static PyObject *
8114unicode_startswith(PyUnicodeObject *self,
8115 PyObject *args)
8116{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008117 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008119 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008120 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008121 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008123 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008124 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008126 if (PyTuple_Check(subobj)) {
8127 Py_ssize_t i;
8128 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8129 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8130 PyTuple_GET_ITEM(subobj, i));
8131 if (substring == NULL)
8132 return NULL;
8133 result = tailmatch(self, substring, start, end, -1);
8134 Py_DECREF(substring);
8135 if (result) {
8136 Py_RETURN_TRUE;
8137 }
8138 }
8139 /* nothing matched */
8140 Py_RETURN_FALSE;
8141 }
8142 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008144 return NULL;
8145 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008147 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148}
8149
8150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008151PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008152"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008154Return True if S ends with the specified suffix, False otherwise.\n\
8155With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008156With optional end, stop comparing S at that position.\n\
8157suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158
8159static PyObject *
8160unicode_endswith(PyUnicodeObject *self,
8161 PyObject *args)
8162{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008163 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008165 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008166 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008167 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008169 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8170 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008172 if (PyTuple_Check(subobj)) {
8173 Py_ssize_t i;
8174 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8175 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8176 PyTuple_GET_ITEM(subobj, i));
8177 if (substring == NULL)
8178 return NULL;
8179 result = tailmatch(self, substring, start, end, +1);
8180 Py_DECREF(substring);
8181 if (result) {
8182 Py_RETURN_TRUE;
8183 }
8184 }
8185 Py_RETURN_FALSE;
8186 }
8187 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008191 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008193 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194}
8195
Eric Smith8c663262007-08-25 02:26:07 +00008196#include "stringlib/string_format.h"
8197
8198PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008199"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008200\n\
8201");
8202
Eric Smith4a7d76d2008-05-30 18:10:19 +00008203static PyObject *
8204unicode__format__(PyObject* self, PyObject* args)
8205{
8206 PyObject *format_spec;
8207
8208 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8209 return NULL;
8210
8211 return _PyUnicode_FormatAdvanced(self,
8212 PyUnicode_AS_UNICODE(format_spec),
8213 PyUnicode_GET_SIZE(format_spec));
8214}
8215
Eric Smith8c663262007-08-25 02:26:07 +00008216PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008217"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008218\n\
8219");
8220
8221static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008222unicode_getnewargs(PyUnicodeObject *v)
8223{
8224 return Py_BuildValue("(u#)", v->str, v->length);
8225}
8226
8227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228static PyMethodDef unicode_methods[] = {
8229
8230 /* Order is according to common usage: often used methods should
8231 appear first, since lookup is done sequentially. */
8232
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008233 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8234 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8235 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008236 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008237 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8238 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8239 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8240 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8241 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8242 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8243 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008245 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8246 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8247 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008248 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008249 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8250 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8251 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008252 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008254 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008255 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008256 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8257 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8258 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8259 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8260 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8261 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8262 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8263 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8264 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8265 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8266 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8267 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8268 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8269 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008270 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008271 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008272 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008273 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008274 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8275 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008276 {"maketrans", (PyCFunction) unicode_maketrans,
8277 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008278#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008279 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280#endif
8281
8282#if 0
8283 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008284 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285#endif
8286
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008287 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 {NULL, NULL}
8289};
8290
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008291static PyObject *
8292unicode_mod(PyObject *v, PyObject *w)
8293{
8294 if (!PyUnicode_Check(v)) {
8295 Py_INCREF(Py_NotImplemented);
8296 return Py_NotImplemented;
8297 }
8298 return PyUnicode_Format(v, w);
8299}
8300
8301static PyNumberMethods unicode_as_number = {
8302 0, /*nb_add*/
8303 0, /*nb_subtract*/
8304 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008305 unicode_mod, /*nb_remainder*/
8306};
8307
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008309 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008310 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008311 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8312 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008313 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 0, /* sq_ass_item */
8315 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008316 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317};
8318
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008319static PyObject*
8320unicode_subscript(PyUnicodeObject* self, PyObject* item)
8321{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008322 if (PyIndex_Check(item)) {
8323 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008324 if (i == -1 && PyErr_Occurred())
8325 return NULL;
8326 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008327 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008328 return unicode_getitem(self, i);
8329 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008330 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008331 Py_UNICODE* source_buf;
8332 Py_UNICODE* result_buf;
8333 PyObject* result;
8334
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008335 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008336 &start, &stop, &step, &slicelength) < 0) {
8337 return NULL;
8338 }
8339
8340 if (slicelength <= 0) {
8341 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008342 } else if (start == 0 && step == 1 && slicelength == self->length &&
8343 PyUnicode_CheckExact(self)) {
8344 Py_INCREF(self);
8345 return (PyObject *)self;
8346 } else if (step == 1) {
8347 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008348 } else {
8349 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008350 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8351 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008352
8353 if (result_buf == NULL)
8354 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008355
8356 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8357 result_buf[i] = source_buf[cur];
8358 }
Tim Petersced69f82003-09-16 20:30:58 +00008359
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008360 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008361 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008362 return result;
8363 }
8364 } else {
8365 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8366 return NULL;
8367 }
8368}
8369
8370static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008371 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008372 (binaryfunc)unicode_subscript, /* mp_subscript */
8373 (objobjargproc)0, /* mp_ass_subscript */
8374};
8375
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377/* Helpers for PyUnicode_Format() */
8378
8379static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 if (argidx < arglen) {
8384 (*p_argidx)++;
8385 if (arglen < 0)
8386 return args;
8387 else
8388 return PyTuple_GetItem(args, argidx);
8389 }
8390 PyErr_SetString(PyExc_TypeError,
8391 "not enough arguments for format string");
8392 return NULL;
8393}
8394
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008396strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 register Py_ssize_t i;
8399 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 for (i = len - 1; i >= 0; i--)
8401 buffer[i] = (Py_UNICODE) charbuffer[i];
8402
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 return len;
8404}
8405
Neal Norwitzfc76d632006-01-10 06:03:13 +00008406static int
8407doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8408{
Tim Peters15231542006-02-16 01:08:01 +00008409 Py_ssize_t result;
8410
Neal Norwitzfc76d632006-01-10 06:03:13 +00008411 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008412 result = strtounicode(buffer, (char *)buffer);
8413 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008414}
8415
#if 0
/* Disabled: format the long x according to format into buffer as
   Py_UNICODE characters and return the resulting length.  The char text is
   written by PyOS_snprintf() into the storage of buffer itself and then
   widened in place by strtounicode(). */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t count;

    PyOS_snprintf(bytes, len, format, x);
    count = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(count, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008427
Guido van Rossum078151d2002-08-11 04:24:12 +00008428/* XXX To save some code duplication, formatfloat/long/int could have been
8429 shared with stringobject.c, converting from 8-bit to Unicode after the
8430 formatting is done. */
8431
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432static int
8433formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008434 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 int flags,
8436 int prec,
8437 int type,
8438 PyObject *v)
8439{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008440 /* fmt = '%#.' + `prec` + `type`
8441 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 char fmt[20];
8443 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008444
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 x = PyFloat_AsDouble(v);
8446 if (x == -1.0 && PyErr_Occurred())
8447 return -1;
8448 if (prec < 0)
8449 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8451 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008452 /* Worst case length calc to ensure no buffer overrun:
8453
8454 'g' formats:
8455 fmt = %#.<prec>g
8456 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8457 for any double rep.)
8458 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8459
8460 'f' formats:
8461 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8462 len = 1 + 50 + 1 + prec = 52 + prec
8463
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008464 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008465 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008466
8467 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008468 if (((type == 'g' || type == 'G') &&
8469 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008470 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008471 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008472 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008473 return -1;
8474 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008475 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8476 (flags&F_ALT) ? "#" : "",
8477 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008478 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479}
8480
Tim Peters38fd5b62000-09-21 05:43:11 +00008481static PyObject*
8482formatlong(PyObject *val, int flags, int prec, int type)
8483{
8484 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008485 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008486 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008487 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008488
Christian Heimes72b710a2008-05-26 13:28:38 +00008489 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008490 if (!str)
8491 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008492 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008493 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008494 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008495}
8496
#if 0
/* Disabled: format the integer object v into buf (capacity buflen) using a
   C long and a printf-style format built on the fly.  Returns the number
   of characters written, or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *sign = "";
    int is_radix;       /* true for the 'x', 'X' and 'o' conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';
    is_radix = (type == 'x' || type == 'X' || type == 'o');
    /* Negative hex/octal values are printed as '-' plus the magnitude. */
    if (x < 0 && is_radix)
        sign = "-";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_radix) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    /* With an explicit sign in the format, pass the magnitude. */
    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
8575static int
8576formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008577 size_t buflen,
8578 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008580 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008581 if (PyUnicode_Check(v)) {
8582 if (PyUnicode_GET_SIZE(v) != 1)
8583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 else {
8587 /* Integer input truncated to a character */
8588 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008589 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008591 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008592#ifdef Py_UNICODE_WIDE
8593 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008594 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008595 "%c arg not in range(0x110000) "
8596 "(wide Python build)");
8597 return -1;
8598 }
8599#else
8600 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008601 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008602 "%c arg not in range(0x10000) "
8603 "(narrow Python build)");
8604 return -1;
8605 }
8606#endif
8607 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 }
8609 buf[1] = '\0';
8610 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008611
8612 onError:
8613 PyErr_SetString(PyExc_TypeError,
8614 "%c requires int or char");
8615 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8627
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628PyObject *PyUnicode_Format(PyObject *format,
8629 PyObject *args)
8630{
8631 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008632 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 int args_owned = 0;
8634 PyUnicodeObject *result = NULL;
8635 PyObject *dict = NULL;
8636 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008637
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 if (format == NULL || args == NULL) {
8639 PyErr_BadInternalCall();
8640 return NULL;
8641 }
8642 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008643 if (uformat == NULL)
8644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 fmt = PyUnicode_AS_UNICODE(uformat);
8646 fmtcnt = PyUnicode_GET_SIZE(uformat);
8647
8648 reslen = rescnt = fmtcnt + 100;
8649 result = _PyUnicode_New(reslen);
8650 if (result == NULL)
8651 goto onError;
8652 res = PyUnicode_AS_UNICODE(result);
8653
8654 if (PyTuple_Check(args)) {
8655 arglen = PyTuple_Size(args);
8656 argidx = 0;
8657 }
8658 else {
8659 arglen = -1;
8660 argidx = -2;
8661 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008662 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008663 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 dict = args;
8665
8666 while (--fmtcnt >= 0) {
8667 if (*fmt != '%') {
8668 if (--rescnt < 0) {
8669 rescnt = fmtcnt + 100;
8670 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008671 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8674 --rescnt;
8675 }
8676 *res++ = *fmt++;
8677 }
8678 else {
8679 /* Got a format specifier */
8680 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008681 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 Py_UNICODE c = '\0';
8684 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008685 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 PyObject *v = NULL;
8687 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008688 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008690 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008691 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692
8693 fmt++;
8694 if (*fmt == '(') {
8695 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 PyObject *key;
8698 int pcount = 1;
8699
8700 if (dict == NULL) {
8701 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008702 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 goto onError;
8704 }
8705 ++fmt;
8706 --fmtcnt;
8707 keystart = fmt;
8708 /* Skip over balanced parentheses */
8709 while (pcount > 0 && --fmtcnt >= 0) {
8710 if (*fmt == ')')
8711 --pcount;
8712 else if (*fmt == '(')
8713 ++pcount;
8714 fmt++;
8715 }
8716 keylen = fmt - keystart - 1;
8717 if (fmtcnt < 0 || pcount > 0) {
8718 PyErr_SetString(PyExc_ValueError,
8719 "incomplete format key");
8720 goto onError;
8721 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008722#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008723 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 then looked up since Python uses strings to hold
8725 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008726 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 key = PyUnicode_EncodeUTF8(keystart,
8728 keylen,
8729 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008730#else
8731 key = PyUnicode_FromUnicode(keystart, keylen);
8732#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 if (key == NULL)
8734 goto onError;
8735 if (args_owned) {
8736 Py_DECREF(args);
8737 args_owned = 0;
8738 }
8739 args = PyObject_GetItem(dict, key);
8740 Py_DECREF(key);
8741 if (args == NULL) {
8742 goto onError;
8743 }
8744 args_owned = 1;
8745 arglen = -1;
8746 argidx = -2;
8747 }
8748 while (--fmtcnt >= 0) {
8749 switch (c = *fmt++) {
8750 case '-': flags |= F_LJUST; continue;
8751 case '+': flags |= F_SIGN; continue;
8752 case ' ': flags |= F_BLANK; continue;
8753 case '#': flags |= F_ALT; continue;
8754 case '0': flags |= F_ZERO; continue;
8755 }
8756 break;
8757 }
8758 if (c == '*') {
8759 v = getnextarg(args, arglen, &argidx);
8760 if (v == NULL)
8761 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008762 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 PyErr_SetString(PyExc_TypeError,
8764 "* wants int");
8765 goto onError;
8766 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008767 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008768 if (width == -1 && PyErr_Occurred())
8769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 if (width < 0) {
8771 flags |= F_LJUST;
8772 width = -width;
8773 }
8774 if (--fmtcnt >= 0)
8775 c = *fmt++;
8776 }
8777 else if (c >= '0' && c <= '9') {
8778 width = c - '0';
8779 while (--fmtcnt >= 0) {
8780 c = *fmt++;
8781 if (c < '0' || c > '9')
8782 break;
8783 if ((width*10) / 10 != width) {
8784 PyErr_SetString(PyExc_ValueError,
8785 "width too big");
8786 goto onError;
8787 }
8788 width = width*10 + (c - '0');
8789 }
8790 }
8791 if (c == '.') {
8792 prec = 0;
8793 if (--fmtcnt >= 0)
8794 c = *fmt++;
8795 if (c == '*') {
8796 v = getnextarg(args, arglen, &argidx);
8797 if (v == NULL)
8798 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008799 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 PyErr_SetString(PyExc_TypeError,
8801 "* wants int");
8802 goto onError;
8803 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008804 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008805 if (prec == -1 && PyErr_Occurred())
8806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 if (prec < 0)
8808 prec = 0;
8809 if (--fmtcnt >= 0)
8810 c = *fmt++;
8811 }
8812 else if (c >= '0' && c <= '9') {
8813 prec = c - '0';
8814 while (--fmtcnt >= 0) {
8815 c = Py_CHARMASK(*fmt++);
8816 if (c < '0' || c > '9')
8817 break;
8818 if ((prec*10) / 10 != prec) {
8819 PyErr_SetString(PyExc_ValueError,
8820 "prec too big");
8821 goto onError;
8822 }
8823 prec = prec*10 + (c - '0');
8824 }
8825 }
8826 } /* prec */
8827 if (fmtcnt >= 0) {
8828 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 if (--fmtcnt >= 0)
8830 c = *fmt++;
8831 }
8832 }
8833 if (fmtcnt < 0) {
8834 PyErr_SetString(PyExc_ValueError,
8835 "incomplete format");
8836 goto onError;
8837 }
8838 if (c != '%') {
8839 v = getnextarg(args, arglen, &argidx);
8840 if (v == NULL)
8841 goto onError;
8842 }
8843 sign = 0;
8844 fill = ' ';
8845 switch (c) {
8846
8847 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008848 pbuf = formatbuf;
8849 /* presume that buffer length is at least 1 */
8850 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 len = 1;
8852 break;
8853
8854 case 's':
8855 case 'r':
8856 if (PyUnicode_Check(v) && c == 's') {
8857 temp = v;
8858 Py_INCREF(temp);
8859 }
8860 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008862 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 else
8864 temp = PyObject_Repr(v);
8865 if (temp == NULL)
8866 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008867 if (PyUnicode_Check(temp))
8868 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008869 else {
8870 Py_DECREF(temp);
8871 PyErr_SetString(PyExc_TypeError,
8872 "%s argument has non-string str()");
8873 goto onError;
8874 }
8875 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008876 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 len = PyUnicode_GET_SIZE(temp);
8878 if (prec >= 0 && len > prec)
8879 len = prec;
8880 break;
8881
8882 case 'i':
8883 case 'd':
8884 case 'u':
8885 case 'o':
8886 case 'x':
8887 case 'X':
8888 if (c == 'i')
8889 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008890 isnumok = 0;
8891 if (PyNumber_Check(v)) {
8892 PyObject *iobj=NULL;
8893
8894 if (PyLong_Check(v)) {
8895 iobj = v;
8896 Py_INCREF(iobj);
8897 }
8898 else {
8899 iobj = PyNumber_Long(v);
8900 }
8901 if (iobj!=NULL) {
8902 if (PyLong_Check(iobj)) {
8903 isnumok = 1;
8904 temp = formatlong(iobj, flags, prec, c);
8905 Py_DECREF(iobj);
8906 if (!temp)
8907 goto onError;
8908 pbuf = PyUnicode_AS_UNICODE(temp);
8909 len = PyUnicode_GET_SIZE(temp);
8910 sign = 1;
8911 }
8912 else {
8913 Py_DECREF(iobj);
8914 }
8915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008917 if (!isnumok) {
8918 PyErr_Format(PyExc_TypeError,
8919 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00008920 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008921 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008922 }
8923 if (flags & F_ZERO)
8924 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 break;
8926
8927 case 'e':
8928 case 'E':
8929 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008930 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 case 'g':
8932 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008933 if (c == 'F')
8934 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008935 pbuf = formatbuf;
8936 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8937 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 if (len < 0)
8939 goto onError;
8940 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008941 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 fill = '0';
8943 break;
8944
8945 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008946 pbuf = formatbuf;
8947 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 if (len < 0)
8949 goto onError;
8950 break;
8951
8952 default:
8953 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008954 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008955 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008956 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008957 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008958 (Py_ssize_t)(fmt - 1 -
8959 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 goto onError;
8961 }
8962 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008963 if (*pbuf == '-' || *pbuf == '+') {
8964 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 len--;
8966 }
8967 else if (flags & F_SIGN)
8968 sign = '+';
8969 else if (flags & F_BLANK)
8970 sign = ' ';
8971 else
8972 sign = 0;
8973 }
8974 if (width < len)
8975 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008976 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 reslen -= rescnt;
8978 rescnt = width + fmtcnt + 100;
8979 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008980 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008981 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008982 PyErr_NoMemory();
8983 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008984 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008985 if (_PyUnicode_Resize(&result, reslen) < 0) {
8986 Py_XDECREF(temp);
8987 goto onError;
8988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 res = PyUnicode_AS_UNICODE(result)
8990 + reslen - rescnt;
8991 }
8992 if (sign) {
8993 if (fill != ' ')
8994 *res++ = sign;
8995 rescnt--;
8996 if (width > len)
8997 width--;
8998 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008999 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009000 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009001 assert(pbuf[1] == c);
9002 if (fill != ' ') {
9003 *res++ = *pbuf++;
9004 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009005 }
Tim Petersfff53252001-04-12 18:38:48 +00009006 rescnt -= 2;
9007 width -= 2;
9008 if (width < 0)
9009 width = 0;
9010 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 if (width > len && !(flags & F_LJUST)) {
9013 do {
9014 --rescnt;
9015 *res++ = fill;
9016 } while (--width > len);
9017 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009018 if (fill == ' ') {
9019 if (sign)
9020 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009021 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009022 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009023 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009024 *res++ = *pbuf++;
9025 *res++ = *pbuf++;
9026 }
9027 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009028 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 res += len;
9030 rescnt -= len;
9031 while (--width >= len) {
9032 --rescnt;
9033 *res++ = ' ';
9034 }
9035 if (dict && (argidx < arglen) && c != '%') {
9036 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009037 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009038 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 goto onError;
9040 }
9041 Py_XDECREF(temp);
9042 } /* '%' */
9043 } /* until end */
9044 if (argidx < arglen && !dict) {
9045 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009046 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 goto onError;
9048 }
9049
Thomas Woutersa96affe2006-03-12 00:29:36 +00009050 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 if (args_owned) {
9053 Py_DECREF(args);
9054 }
9055 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 return (PyObject *)result;
9057
9058 onError:
9059 Py_XDECREF(result);
9060 Py_DECREF(uformat);
9061 if (args_owned) {
9062 Py_DECREF(args);
9063 }
9064 return NULL;
9065}
9066
Jeremy Hylton938ace62002-07-17 16:30:39 +00009067static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009068unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9069
Tim Peters6d6c1a32001-08-02 04:15:00 +00009070static PyObject *
9071unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9072{
9073 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009074 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009075 char *encoding = NULL;
9076 char *errors = NULL;
9077
Guido van Rossume023fe02001-08-30 03:12:59 +00009078 if (type != &PyUnicode_Type)
9079 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009080 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009081 kwlist, &x, &encoding, &errors))
9082 return NULL;
9083 if (x == NULL)
9084 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009085 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009086 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009087 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009088 return PyUnicode_FromEncodedObject(x, encoding, errors);
9089}
9090
Guido van Rossume023fe02001-08-30 03:12:59 +00009091static PyObject *
9092unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9093{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009094 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009095 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009096
9097 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9098 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9099 if (tmp == NULL)
9100 return NULL;
9101 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009102 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009103 if (pnew == NULL) {
9104 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009105 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009106 }
Christian Heimesb186d002008-03-18 15:15:01 +00009107 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009108 if (pnew->str == NULL) {
9109 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009110 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009111 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009112 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009113 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009114 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9115 pnew->length = n;
9116 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009117 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009118 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009119}
9120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009121PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009122"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009123\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009124Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009125encoding defaults to the current default string encoding.\n\
9126errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009127
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009128static PyObject *unicode_iter(PyObject *seq);
9129
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009131 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009132 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 sizeof(PyUnicodeObject), /* tp_size */
9134 0, /* tp_itemsize */
9135 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009136 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009138 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009140 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009141 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009142 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009144 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 (hashfunc) unicode_hash, /* tp_hash*/
9146 0, /* tp_call*/
9147 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009148 PyObject_GenericGetAttr, /* tp_getattro */
9149 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009150 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009151 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9152 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009153 unicode_doc, /* tp_doc */
9154 0, /* tp_traverse */
9155 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009156 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009157 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009158 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009159 0, /* tp_iternext */
9160 unicode_methods, /* tp_methods */
9161 0, /* tp_members */
9162 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009163 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009164 0, /* tp_dict */
9165 0, /* tp_descr_get */
9166 0, /* tp_descr_set */
9167 0, /* tp_dictoffset */
9168 0, /* tp_init */
9169 0, /* tp_alloc */
9170 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009171 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172};
9173
9174/* Initialize the Unicode implementation */
9175
Thomas Wouters78890102000-07-22 19:25:51 +00009176void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009178 int i;
9179
Thomas Wouters477c8d52006-05-27 19:21:47 +00009180 /* XXX - move this array to unicodectype.c ? */
9181 Py_UNICODE linebreak[] = {
9182 0x000A, /* LINE FEED */
9183 0x000D, /* CARRIAGE RETURN */
9184 0x001C, /* FILE SEPARATOR */
9185 0x001D, /* GROUP SEPARATOR */
9186 0x001E, /* RECORD SEPARATOR */
9187 0x0085, /* NEXT LINE */
9188 0x2028, /* LINE SEPARATOR */
9189 0x2029, /* PARAGRAPH SEPARATOR */
9190 };
9191
Fred Drakee4315f52000-05-09 19:53:39 +00009192 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009193 free_list = NULL;
9194 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009196 if (!unicode_empty)
9197 return;
9198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009199 for (i = 0; i < 256; i++)
9200 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009201 if (PyType_Ready(&PyUnicode_Type) < 0)
9202 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009203
9204 /* initialize the linebreak bloom filter */
9205 bloom_linebreak = make_bloom_mask(
9206 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9207 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009208
9209 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210}
9211
9212/* Finalize the Unicode implementation */
9213
Christian Heimesa156e092008-02-16 07:38:31 +00009214int
9215PyUnicode_ClearFreeList(void)
9216{
9217 int freelist_size = numfree;
9218 PyUnicodeObject *u;
9219
9220 for (u = free_list; u != NULL;) {
9221 PyUnicodeObject *v = u;
9222 u = *(PyUnicodeObject **)u;
9223 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009224 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009225 Py_XDECREF(v->defenc);
9226 PyObject_Del(v);
9227 numfree--;
9228 }
9229 free_list = NULL;
9230 assert(numfree == 0);
9231 return freelist_size;
9232}
9233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234void
Thomas Wouters78890102000-07-22 19:25:51 +00009235_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009237 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009239 Py_XDECREF(unicode_empty);
9240 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009241
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009242 for (i = 0; i < 256; i++) {
9243 if (unicode_latin1[i]) {
9244 Py_DECREF(unicode_latin1[i]);
9245 unicode_latin1[i] = NULL;
9246 }
9247 }
Christian Heimesa156e092008-02-16 07:38:31 +00009248 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009250
Walter Dörwald16807132007-05-25 13:52:07 +00009251void
9252PyUnicode_InternInPlace(PyObject **p)
9253{
9254 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9255 PyObject *t;
9256 if (s == NULL || !PyUnicode_Check(s))
9257 Py_FatalError(
9258 "PyUnicode_InternInPlace: unicode strings only please!");
9259 /* If it's a subclass, we don't really know what putting
9260 it in the interned dict might do. */
9261 if (!PyUnicode_CheckExact(s))
9262 return;
9263 if (PyUnicode_CHECK_INTERNED(s))
9264 return;
9265 if (interned == NULL) {
9266 interned = PyDict_New();
9267 if (interned == NULL) {
9268 PyErr_Clear(); /* Don't leave an exception */
9269 return;
9270 }
9271 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009272 /* It might be that the GetItem call fails even
9273 though the key is present in the dictionary,
9274 namely when this happens during a stack overflow. */
9275 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009276 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009277 Py_END_ALLOW_RECURSION
9278
Walter Dörwald16807132007-05-25 13:52:07 +00009279 if (t) {
9280 Py_INCREF(t);
9281 Py_DECREF(*p);
9282 *p = t;
9283 return;
9284 }
9285
Martin v. Löwis5b222132007-06-10 09:51:05 +00009286 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009287 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9288 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009289 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009290 return;
9291 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009292 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009293 /* The two references in interned are not counted by refcnt.
9294 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009295 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009296 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9297}
9298
9299void
9300PyUnicode_InternImmortal(PyObject **p)
9301{
9302 PyUnicode_InternInPlace(p);
9303 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9304 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9305 Py_INCREF(*p);
9306 }
9307}
9308
9309PyObject *
9310PyUnicode_InternFromString(const char *cp)
9311{
9312 PyObject *s = PyUnicode_FromString(cp);
9313 if (s == NULL)
9314 return NULL;
9315 PyUnicode_InternInPlace(&s);
9316 return s;
9317}
9318
9319void _Py_ReleaseInternedUnicodeStrings(void)
9320{
9321 PyObject *keys;
9322 PyUnicodeObject *s;
9323 Py_ssize_t i, n;
9324 Py_ssize_t immortal_size = 0, mortal_size = 0;
9325
9326 if (interned == NULL || !PyDict_Check(interned))
9327 return;
9328 keys = PyDict_Keys(interned);
9329 if (keys == NULL || !PyList_Check(keys)) {
9330 PyErr_Clear();
9331 return;
9332 }
9333
9334 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9335 detector, interned unicode strings are not forcibly deallocated;
9336 rather, we give them their stolen references back, and then clear
9337 and DECREF the interned dict. */
9338
9339 n = PyList_GET_SIZE(keys);
9340 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9341 n);
9342 for (i = 0; i < n; i++) {
9343 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9344 switch (s->state) {
9345 case SSTATE_NOT_INTERNED:
9346 /* XXX Shouldn't happen */
9347 break;
9348 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009349 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009350 immortal_size += s->length;
9351 break;
9352 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009353 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009354 mortal_size += s->length;
9355 break;
9356 default:
9357 Py_FatalError("Inconsistent interned string state.");
9358 }
9359 s->state = SSTATE_NOT_INTERNED;
9360 }
9361 fprintf(stderr, "total size of all interned strings: "
9362 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9363 "mortal/immortal\n", mortal_size, immortal_size);
9364 Py_DECREF(keys);
9365 PyDict_Clear(interned);
9366 Py_DECREF(interned);
9367 interned = NULL;
9368}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009369
9370
9371/********************* Unicode Iterator **************************/
9372
9373typedef struct {
9374 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009375 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009376 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9377} unicodeiterobject;
9378
9379static void
9380unicodeiter_dealloc(unicodeiterobject *it)
9381{
9382 _PyObject_GC_UNTRACK(it);
9383 Py_XDECREF(it->it_seq);
9384 PyObject_GC_Del(it);
9385}
9386
9387static int
9388unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9389{
9390 Py_VISIT(it->it_seq);
9391 return 0;
9392}
9393
9394static PyObject *
9395unicodeiter_next(unicodeiterobject *it)
9396{
9397 PyUnicodeObject *seq;
9398 PyObject *item;
9399
9400 assert(it != NULL);
9401 seq = it->it_seq;
9402 if (seq == NULL)
9403 return NULL;
9404 assert(PyUnicode_Check(seq));
9405
9406 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009407 item = PyUnicode_FromUnicode(
9408 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009409 if (item != NULL)
9410 ++it->it_index;
9411 return item;
9412 }
9413
9414 Py_DECREF(seq);
9415 it->it_seq = NULL;
9416 return NULL;
9417}
9418
9419static PyObject *
9420unicodeiter_len(unicodeiterobject *it)
9421{
9422 Py_ssize_t len = 0;
9423 if (it->it_seq)
9424 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009425 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009426}
9427
9428PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9429
9430static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009431 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9432 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009433 {NULL, NULL} /* sentinel */
9434};
9435
9436PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009437 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009438 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009439 sizeof(unicodeiterobject), /* tp_basicsize */
9440 0, /* tp_itemsize */
9441 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009442 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009443 0, /* tp_print */
9444 0, /* tp_getattr */
9445 0, /* tp_setattr */
9446 0, /* tp_compare */
9447 0, /* tp_repr */
9448 0, /* tp_as_number */
9449 0, /* tp_as_sequence */
9450 0, /* tp_as_mapping */
9451 0, /* tp_hash */
9452 0, /* tp_call */
9453 0, /* tp_str */
9454 PyObject_GenericGetAttr, /* tp_getattro */
9455 0, /* tp_setattro */
9456 0, /* tp_as_buffer */
9457 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9458 0, /* tp_doc */
9459 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9460 0, /* tp_clear */
9461 0, /* tp_richcompare */
9462 0, /* tp_weaklistoffset */
9463 PyObject_SelfIter, /* tp_iter */
9464 (iternextfunc)unicodeiter_next, /* tp_iternext */
9465 unicodeiter_methods, /* tp_methods */
9466 0,
9467};
9468
9469static PyObject *
9470unicode_iter(PyObject *seq)
9471{
9472 unicodeiterobject *it;
9473
9474 if (!PyUnicode_Check(seq)) {
9475 PyErr_BadInternalCall();
9476 return NULL;
9477 }
9478 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9479 if (it == NULL)
9480 return NULL;
9481 it->it_index = 0;
9482 Py_INCREF(seq);
9483 it->it_seq = (PyUnicodeObject *)seq;
9484 _PyObject_GC_TRACK(it);
9485 return (PyObject *)it;
9486}
9487
Martin v. Löwis5b222132007-06-10 09:51:05 +00009488size_t
9489Py_UNICODE_strlen(const Py_UNICODE *u)
9490{
9491 int res = 0;
9492 while(*u++)
9493 res++;
9494 return res;
9495}
9496
9497Py_UNICODE*
9498Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9499{
9500 Py_UNICODE *u = s1;
9501 while ((*u++ = *s2++));
9502 return s1;
9503}
9504
9505Py_UNICODE*
9506Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9507{
9508 Py_UNICODE *u = s1;
9509 while ((*u++ = *s2++))
9510 if (n-- == 0)
9511 break;
9512 return s1;
9513}
9514
9515int
9516Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9517{
9518 while (*s1 && *s2 && *s1 == *s2)
9519 s1++, s2++;
9520 if (*s1 && *s2)
9521 return (*s1 < *s2) ? -1 : +1;
9522 if (*s1)
9523 return 1;
9524 if (*s2)
9525 return -1;
9526 return 0;
9527}
9528
9529Py_UNICODE*
9530Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9531{
9532 const Py_UNICODE *p;
9533 for (p = s; *p; p++)
9534 if (*p == c)
9535 return (Py_UNICODE*)p;
9536 return NULL;
9537}
9538
9539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009540#ifdef __cplusplus
9541}
9542#endif
9543
9544
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009545/*
9546Local variables:
9547c-basic-offset: 4
9548indent-tabs-mode: nil
9549End:
9550*/