blob: 60cd9578040058c8c3d26ca28c7e9b0ea098cb07 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0..127: an entry is 1 when the ASCII
   character counts as whitespace and 0 otherwise.  Each row holds eight
   consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: HORIZONTAL TABULATION
       0x000A: LINE FEED
       0x000B: VERTICAL TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
/* Lookup table indexed by code point 0..127: an entry is 1 when the ASCII
   character is treated as a line break and 0 otherwise.  The table is only
   ever read, hence const.  Each row holds eight consecutive code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type holding a bloom bitmask; one bit per value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that selecting bit 31 is well
   defined (a signed int shift would overflow) and matches BLOOM_MASK. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break: code points below 128 are looked up in
   ascii_linebreak; all others are pre-filtered through bloom_linebreak
   before the Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Non-zero when chr is a member of set[0..setlen-1].  The BLOOM() test on
   mask runs first, so unicode_member() is only called when the mask says
   the character may be present.  The expansion is fully parenthesized so
   the macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Martin v. Löwis18e16552006-02-15 17:27:45 +0000421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422{
423 register PyUnicodeObject *v;
424
425 /* Argument checks */
426 if (unicode == NULL) {
427 PyErr_BadInternalCall();
428 return -1;
429 }
430 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000431 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432 PyErr_BadInternalCall();
433 return -1;
434 }
435
436 /* Resizing unicode_empty and single character objects is not
437 possible since these are being shared. We simply return a fresh
438 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000439 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 (v == unicode_empty || v->length == 1)) {
441 PyUnicodeObject *w = _PyUnicode_New(length);
442 if (w == NULL)
443 return -1;
444 Py_UNICODE_COPY(w->str, v->str,
445 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000446 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 *unicode = (PyObject *)w;
448 return 0;
449 }
450
451 /* Note that we don't have to modify *unicode for unshared Unicode
452 objects, since we can modify them in-place. */
453 return unicode_resize(v, length);
454}
455
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() with unicodevar cast to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
459
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000461 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462{
463 PyUnicodeObject *unicode;
464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000465 /* If the Unicode data is known at construction time, we can apply
466 some optimizations which share commonly used objects. */
467 if (u != NULL) {
468
469 /* Optimization for empty strings */
470 if (size == 0 && unicode_empty != NULL) {
471 Py_INCREF(unicode_empty);
472 return (PyObject *)unicode_empty;
473 }
474
475 /* Single character Unicode objects in the Latin-1 range are
476 shared when using this constructor */
477 if (size == 1 && *u < 256) {
478 unicode = unicode_latin1[*u];
479 if (!unicode) {
480 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 if (!unicode)
482 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000483 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484 unicode_latin1[*u] = unicode;
485 }
486 Py_INCREF(unicode);
487 return (PyObject *)unicode;
488 }
489 }
Tim Petersced69f82003-09-16 20:30:58 +0000490
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the Unicode data into the new object */
496 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000497 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498
499 return (PyObject *)unicode;
500}
501
Walter Dörwaldd2034312007-05-18 16:29:38 +0000502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000503{
504 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000505
506 if (size < 0) {
507 PyErr_SetString(PyExc_SystemError,
508 "Negative size passed to PyUnicode_FromStringAndSize");
509 return NULL;
510 }
511
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000513 some optimizations which share commonly used objects.
514 Also, this means the input must be UTF-8, so fall back to the
515 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (u != NULL) {
517
518 /* Optimization for empty strings */
519 if (size == 0 && unicode_empty != NULL) {
520 Py_INCREF(unicode_empty);
521 return (PyObject *)unicode_empty;
522 }
523
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 /* Single characters are shared when using this constructor.
525 Restrict to ASCII, since the input must be UTF-8. */
526 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000527 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (!unicode) {
529 unicode = _PyUnicode_New(1);
530 if (!unicode)
531 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000532 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000533 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 }
535 Py_INCREF(unicode);
536 return (PyObject *)unicode;
537 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000538
539 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 }
541
Walter Dörwald55507312007-05-18 13:12:10 +0000542 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000543 if (!unicode)
544 return NULL;
545
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000546 return (PyObject *)unicode;
547}
548
Walter Dörwaldd2034312007-05-18 16:29:38 +0000549PyObject *PyUnicode_FromString(const char *u)
550{
551 size_t size = strlen(u);
552 if (size > PY_SSIZE_T_MAX) {
553 PyErr_SetString(PyExc_OverflowError, "input too long");
554 return NULL;
555 }
556
557 return PyUnicode_FromStringAndSize(u, size);
558}
559
Guido van Rossumd57fd912000-03-10 22:53:23 +0000560#ifdef HAVE_WCHAR_H
561
562PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000563 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564{
565 PyUnicodeObject *unicode;
566
567 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000568 if (size == 0)
569 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
Martin v. Löwis790465f2008-04-05 20:41:37 +0000574 if (size == -1) {
575 size = wcslen(w);
576 }
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578 unicode = _PyUnicode_New(size);
579 if (!unicode)
580 return NULL;
581
582 /* Copy the wchar_t data into the new object */
583#ifdef HAVE_USABLE_WCHAR_T
584 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000585#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 {
587 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000590 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 *u++ = *w++;
592 }
593#endif
594
595 return (PyObject *)unicode;
596}
597
Walter Dörwald346737f2007-05-31 10:44:43 +0000598static void
599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
600{
601 *fmt++ = '%';
602 if (width) {
603 if (zeropad)
604 *fmt++ = '0';
605 fmt += sprintf(fmt, "%d", width);
606 }
607 if (precision)
608 fmt += sprintf(fmt, ".%d", precision);
609 if (longflag)
610 *fmt++ = 'l';
611 else if (size_tflag) {
612 char *f = PY_FORMAT_SIZE_T;
613 while (*f)
614 *fmt++ = *f++;
615 }
616 *fmt++ = c;
617 *fmt = '\0';
618}
619
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`; uses the caller's `copy` variable as the read
   cursor.  Wrapped in do { } while (0) so that it behaves as a single
   statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
621
622PyObject *
623PyUnicode_FromFormatV(const char *format, va_list vargs)
624{
625 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000626 Py_ssize_t callcount = 0;
627 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000628 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000629 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000630 int width = 0;
631 int precision = 0;
632 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000633 const char* f;
634 Py_UNICODE *s;
635 PyObject *string;
636 /* used by sprintf */
637 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000638 /* use abuffer instead of buffer, if we need more space
639 * (which can happen if there's a format specifier with width). */
640 char *abuffer = NULL;
641 char *realbuffer;
642 Py_ssize_t abuffersize = 0;
643 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000644 const char *copy;
645
646#ifdef VA_LIST_IS_ARRAY
647 Py_MEMCPY(count, vargs, sizeof(va_list));
648#else
649#ifdef __va_copy
650 __va_copy(count, vargs);
651#else
652 count = vargs;
653#endif
654#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 /* step 1: count the number of %S/%R/%A format specifications
656 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
657 * these objects once during step 3 and put the result in
658 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000660 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 ++callcount;
662 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000663 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000664 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000665 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000666 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (!callresults) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 callresult = callresults;
672 }
673 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 for (f = format; *f; f++) {
675 if (*f == '%') {
676 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000677 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000678 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681 ;
682
683 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
684 * they don't affect the amount of space we reserve.
685 */
686 if ((*f == 'l' || *f == 'z') &&
687 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000688 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000689
690 switch (*f) {
691 case 'c':
692 (void)va_arg(count, int);
693 /* fall through... */
694 case '%':
695 n++;
696 break;
697 case 'd': case 'u': case 'i': case 'x':
698 (void) va_arg(count, int);
699 /* 20 bytes is enough to hold a 64-bit
700 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000701 This isn't enough for octal.
702 If a width is specified we need more
703 (which we allocate later). */
704 if (width < 20)
705 width = 20;
706 n += width;
707 if (abuffersize < width)
708 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 break;
710 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000711 {
712 /* UTF-8 */
713 unsigned char*s;
714 s = va_arg(count, unsigned char*);
715 while (*s) {
716 if (*s < 128) {
717 n++; s++;
718 } else if (*s < 0xc0) {
719 /* invalid UTF-8 */
720 n++; s++;
721 } else if (*s < 0xc0) {
722 n++;
723 s++; if(!*s)break;
724 s++;
725 } else if (*s < 0xe0) {
726 n++;
727 s++; if(!*s)break;
728 s++; if(!*s)break;
729 s++;
730 } else {
731 #ifdef Py_UNICODE_WIDE
732 n++;
733 #else
734 n+=2;
735 #endif
736 s++; if(!*s)break;
737 s++; if(!*s)break;
738 s++; if(!*s)break;
739 s++;
740 }
741 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 case 'U':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 assert(obj && PyUnicode_Check(obj));
748 n += PyUnicode_GET_SIZE(obj);
749 break;
750 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000751 case 'V':
752 {
753 PyObject *obj = va_arg(count, PyObject *);
754 const char *str = va_arg(count, const char *);
755 assert(obj || str);
756 assert(!obj || PyUnicode_Check(obj));
757 if (obj)
758 n += PyUnicode_GET_SIZE(obj);
759 else
760 n += strlen(str);
761 break;
762 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 case 'S':
764 {
765 PyObject *obj = va_arg(count, PyObject *);
766 PyObject *str;
767 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000768 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000769 if (!str)
770 goto fail;
771 n += PyUnicode_GET_SIZE(str);
772 /* Remember the str and switch to the next slot */
773 *callresult++ = str;
774 break;
775 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000776 case 'R':
777 {
778 PyObject *obj = va_arg(count, PyObject *);
779 PyObject *repr;
780 assert(obj);
781 repr = PyObject_Repr(obj);
782 if (!repr)
783 goto fail;
784 n += PyUnicode_GET_SIZE(repr);
785 /* Remember the repr and switch to the next slot */
786 *callresult++ = repr;
787 break;
788 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000789 case 'A':
790 {
791 PyObject *obj = va_arg(count, PyObject *);
792 PyObject *ascii;
793 assert(obj);
794 ascii = PyObject_ASCII(obj);
795 if (!ascii)
796 goto fail;
797 n += PyUnicode_GET_SIZE(ascii);
798 /* Remember the repr and switch to the next slot */
799 *callresult++ = ascii;
800 break;
801 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 case 'p':
803 (void) va_arg(count, int);
804 /* maximum 64-bit pointer representation:
805 * 0xffffffffffffffff
806 * so 19 characters is enough.
807 * XXX I count 18 -- what's the extra for?
808 */
809 n += 19;
810 break;
811 default:
812 /* if we stumble upon an unknown
813 formatting code, copy the rest of
814 the format string to the output
815 string. (we cannot just skip the
816 code, since there's no way to know
817 what's in the argument list) */
818 n += strlen(p);
819 goto expand;
820 }
821 } else
822 n++;
823 }
824 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000825 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000826 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (!abuffer) {
828 PyErr_NoMemory();
829 goto fail;
830 }
831 realbuffer = abuffer;
832 }
833 else
834 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000835 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 we don't have to resize the string.
838 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000839 string = PyUnicode_FromUnicode(NULL, n);
840 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842
843 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845
846 for (f = format; *f; f++) {
847 if (*f == '%') {
848 const char* p = f++;
849 int longflag = 0;
850 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 zeropad = (*f == '0');
852 /* parse the width.precision part */
853 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 width = (width*10) + *f++ - '0';
856 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 if (*f == '.') {
858 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000859 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 /* handle the long flag, but only for %ld and %lu.
863 others can be added when necessary. */
864 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
865 longflag = 1;
866 ++f;
867 }
868 /* handle the size_t flag. */
869 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
870 size_tflag = 1;
871 ++f;
872 }
873
874 switch (*f) {
875 case 'c':
876 *s++ = va_arg(vargs, int);
877 break;
878 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000879 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000880 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, int));
886 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
888 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000889 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000890 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000904 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
905 sprintf(realbuffer, fmt, va_arg(vargs, int));
906 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907 break;
908 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000909 {
910 /* Parameter must be UTF-8 encoded.
911 In case of encoding errors, use
912 the replacement character. */
913 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000914 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000915 u = PyUnicode_DecodeUTF8(p, strlen(p),
916 "replace");
917 if (!u)
918 goto fail;
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
920 PyUnicode_GET_SIZE(u));
921 s += PyUnicode_GET_SIZE(u);
922 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000923 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000924 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 case 'U':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000928 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
929 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
930 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000931 break;
932 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000933 case 'V':
934 {
935 PyObject *obj = va_arg(vargs, PyObject *);
936 const char *str = va_arg(vargs, const char *);
937 if (obj) {
938 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
939 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
940 s += size;
941 } else {
942 appendstring(str);
943 }
944 break;
945 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000946 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 case 'R':
948 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 Py_UNICODE *ucopy;
950 Py_ssize_t usize;
951 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 /* unused, since we already have the result */
953 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000954 ucopy = PyUnicode_AS_UNICODE(*callresult);
955 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 for (upos = 0; upos<usize;)
957 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000958 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000959 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 ++callresult;
962 break;
963 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000964 case 'p':
965 sprintf(buffer, "%p", va_arg(vargs, void*));
966 /* %p is ill-defined: ensure leading 0x. */
967 if (buffer[1] == 'X')
968 buffer[1] = 'x';
969 else if (buffer[1] != 'x') {
970 memmove(buffer+2, buffer, strlen(buffer)+1);
971 buffer[0] = '0';
972 buffer[1] = 'x';
973 }
974 appendstring(buffer);
975 break;
976 case '%':
977 *s++ = '%';
978 break;
979 default:
980 appendstring(p);
981 goto end;
982 }
983 } else
984 *s++ = *f;
985 }
986
987 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000988 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000989 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000990 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000992 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
993 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000994 fail:
995 if (callresults) {
996 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000997 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000998 Py_DECREF(*callresult2);
999 ++callresult2;
1000 }
Christian Heimesb186d002008-03-18 15:15:01 +00001001 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001002 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001003 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001004 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001005 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001006}
1007
1008#undef appendstring
1009
1010PyObject *
1011PyUnicode_FromFormat(const char *format, ...)
1012{
1013 PyObject* ret;
1014 va_list vargs;
1015
1016#ifdef HAVE_STDARG_PROTOTYPES
1017 va_start(vargs, format);
1018#else
1019 va_start(vargs);
1020#endif
1021 ret = PyUnicode_FromFormatV(format, vargs);
1022 va_end(vargs);
1023 return ret;
1024}
1025
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1027 wchar_t *w,
1028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029{
1030 if (unicode == NULL) {
1031 PyErr_BadInternalCall();
1032 return -1;
1033 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001034
1035 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037 size = PyUnicode_GET_SIZE(unicode) + 1;
1038
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039#ifdef HAVE_USABLE_WCHAR_T
1040 memcpy(w, unicode->str, size * sizeof(wchar_t));
1041#else
1042 {
1043 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001044 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001046 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 *w++ = *u++;
1048 }
1049#endif
1050
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001051 if (size > PyUnicode_GET_SIZE(unicode))
1052 return PyUnicode_GET_SIZE(unicode);
1053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 return size;
1055}
1056
1057#endif
1058
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059PyObject *PyUnicode_FromOrdinal(int ordinal)
1060{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001061 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001063 if (ordinal < 0 || ordinal > 0x10ffff) {
1064 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001065 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001066 return NULL;
1067 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001068
1069#ifndef Py_UNICODE_WIDE
1070 if (ordinal > 0xffff) {
1071 ordinal -= 0x10000;
1072 s[0] = 0xD800 | (ordinal >> 10);
1073 s[1] = 0xDC00 | (ordinal & 0x3FF);
1074 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075 }
1076#endif
1077
Hye-Shik Chang40574832004-04-06 07:24:51 +00001078 s[0] = (Py_UNICODE)ordinal;
1079 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001080}
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082PyObject *PyUnicode_FromObject(register PyObject *obj)
1083{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001085 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 if (PyUnicode_CheckExact(obj)) {
1087 Py_INCREF(obj);
1088 return obj;
1089 }
1090 if (PyUnicode_Check(obj)) {
1091 /* For a Unicode subtype that's not a Unicode object,
1092 return a true Unicode object with the same data. */
1093 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1094 PyUnicode_GET_SIZE(obj));
1095 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001096 PyErr_Format(PyExc_TypeError,
1097 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001098 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100}
1101
1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1103 const char *encoding,
1104 const char *errors)
1105{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001107 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (obj == NULL) {
1111 PyErr_BadInternalCall();
1112 return NULL;
1113 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115 if (PyUnicode_Check(obj)) {
1116 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001117 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120
1121 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001122 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001123 s = PyBytes_AS_STRING(obj);
1124 len = PyBytes_GET_SIZE(obj);
1125 }
1126 else if (PyByteArray_Check(obj)) {
1127 s = PyByteArray_AS_STRING(obj);
1128 len = PyByteArray_GET_SIZE(obj);
1129 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001130 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1131 /* Overwrite the error message with something more useful in
1132 case of a TypeError. */
1133 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001134 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001135 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001137 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001138 goto onError;
1139 }
Tim Petersced69f82003-09-16 20:30:58 +00001140
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001141 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 if (len == 0) {
1143 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 }
Tim Petersced69f82003-09-16 20:30:58 +00001146 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001148
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 return v;
1150
1151 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153}
1154
1155PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 const char *encoding,
1158 const char *errors)
1159{
1160 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001161 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 char lower[20]; /* Enough for any encoding name we recognize */
1163 char *l;
1164 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165
1166 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Convert encoding to lower case and replace '_' with '-' in order to
1170 catch e.g. UTF_8 */
1171 e = encoding;
1172 l = lower;
1173 while (*e && l < &lower[(sizeof lower) - 2]) {
1174 if (ISUPPER(*e)) {
1175 *l++ = TOLOWER(*e++);
1176 }
1177 else if (*e == '_') {
1178 *l++ = '-';
1179 e++;
1180 }
1181 else {
1182 *l++ = *e++;
1183 }
1184 }
1185 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001186
1187 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001188 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 else if ((strcmp(lower, "latin-1") == 0) ||
1191 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001197 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "utf-16") == 0)
1200 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1201 else if (strcmp(lower, "utf-32") == 0)
1202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203
1204 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001205 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001206 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001208 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (buffer == NULL)
1210 goto onError;
1211 unicode = PyCodec_Decode(buffer, encoding, errors);
1212 if (unicode == NULL)
1213 goto onError;
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001216 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001217 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_DECREF(unicode);
1219 goto onError;
1220 }
1221 Py_DECREF(buffer);
1222 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 onError:
1225 Py_XDECREF(buffer);
1226 return NULL;
1227}
1228
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001229PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1230 const char *encoding,
1231 const char *errors)
1232{
1233 PyObject *v;
1234
1235 if (!PyUnicode_Check(unicode)) {
1236 PyErr_BadArgument();
1237 goto onError;
1238 }
1239
1240 if (encoding == NULL)
1241 encoding = PyUnicode_GetDefaultEncoding();
1242
1243 /* Decode via the codec registry */
1244 v = PyCodec_Decode(unicode, encoding, errors);
1245 if (v == NULL)
1246 goto onError;
1247 return v;
1248
1249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001253PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1254 const char *encoding,
1255 const char *errors)
1256{
1257 PyObject *v;
1258
1259 if (!PyUnicode_Check(unicode)) {
1260 PyErr_BadArgument();
1261 goto onError;
1262 }
1263
1264 if (encoding == NULL)
1265 encoding = PyUnicode_GetDefaultEncoding();
1266
1267 /* Decode via the codec registry */
1268 v = PyCodec_Decode(unicode, encoding, errors);
1269 if (v == NULL)
1270 goto onError;
1271 if (!PyUnicode_Check(v)) {
1272 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001273 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001274 Py_TYPE(v)->tp_name);
1275 Py_DECREF(v);
1276 goto onError;
1277 }
1278 return v;
1279
1280 onError:
1281 return NULL;
1282}
1283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 unicode = PyUnicode_FromUnicode(s, size);
1292 if (unicode == NULL)
1293 return NULL;
1294 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1295 Py_DECREF(unicode);
1296 return v;
1297}
1298
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1300 const char *encoding,
1301 const char *errors)
1302{
1303 PyObject *v;
1304
1305 if (!PyUnicode_Check(unicode)) {
1306 PyErr_BadArgument();
1307 goto onError;
1308 }
1309
1310 if (encoding == NULL)
1311 encoding = PyUnicode_GetDefaultEncoding();
1312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 return v;
1318
1319 onError:
1320 return NULL;
1321}
1322
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1324 const char *encoding,
1325 const char *errors)
1326{
1327 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 if (!PyUnicode_Check(unicode)) {
1330 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 }
Fred Drakee4315f52000-05-09 19:53:39 +00001333
Tim Petersced69f82003-09-16 20:30:58 +00001334 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001335 encoding = PyUnicode_GetDefaultEncoding();
1336
1337 /* Shortcuts for common default encodings */
1338 if (errors == NULL) {
1339 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001340 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001341 else if (strcmp(encoding, "latin-1") == 0)
1342 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001343#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1344 else if (strcmp(encoding, "mbcs") == 0)
1345 return PyUnicode_AsMBCSString(unicode);
1346#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001347 else if (strcmp(encoding, "ascii") == 0)
1348 return PyUnicode_AsASCIIString(unicode);
1349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350
1351 /* Encode via the codec registry */
1352 v = PyCodec_Encode(unicode, encoding, errors);
1353 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001354 return NULL;
1355
1356 /* The normal path */
1357 if (PyBytes_Check(v))
1358 return v;
1359
1360 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001361 if (PyByteArray_Check(v)) {
1362 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001363 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001364 PyOS_snprintf(msg, sizeof(msg),
1365 "encoder %s returned buffer instead of bytes",
1366 encoding);
1367 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001368 Py_DECREF(v);
1369 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001370 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001371
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001372 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1373 Py_DECREF(v);
1374 return b;
1375 }
1376
1377 PyErr_Format(PyExc_TypeError,
1378 "encoder did not return a bytes object (type=%.400s)",
1379 Py_TYPE(v)->tp_name);
1380 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001381 return NULL;
1382}
1383
1384PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1385 const char *encoding,
1386 const char *errors)
1387{
1388 PyObject *v;
1389
1390 if (!PyUnicode_Check(unicode)) {
1391 PyErr_BadArgument();
1392 goto onError;
1393 }
1394
1395 if (encoding == NULL)
1396 encoding = PyUnicode_GetDefaultEncoding();
1397
1398 /* Encode via the codec registry */
1399 v = PyCodec_Encode(unicode, encoding, errors);
1400 if (v == NULL)
1401 goto onError;
1402 if (!PyUnicode_Check(v)) {
1403 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001404 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001405 Py_TYPE(v)->tp_name);
1406 Py_DECREF(v);
1407 goto onError;
1408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001410
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 onError:
1412 return NULL;
1413}
1414
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001415PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1416 const char *errors)
1417{
1418 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001419 if (v)
1420 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001421 if (errors != NULL)
1422 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001423 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001424 PyUnicode_GET_SIZE(unicode),
1425 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001426 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001427 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001428 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001429 return v;
1430}
1431
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001432PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001433PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001434 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001435 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1436}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001437
Christian Heimes5894ba72007-11-04 11:43:14 +00001438PyObject*
1439PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1440{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001441 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1442 can be undefined. If it is case, decode using UTF-8. The following assumes
1443 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1444 bootstrapping process where the codecs aren't ready yet.
1445 */
1446 if (Py_FileSystemDefaultEncoding) {
1447#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001448 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 return PyUnicode_DecodeMBCS(s, size, "replace");
1450 }
1451#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001452 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001453 return PyUnicode_DecodeUTF8(s, size, "replace");
1454 }
1455#endif
1456 return PyUnicode_Decode(s, size,
1457 Py_FileSystemDefaultEncoding,
1458 "replace");
1459 }
1460 else {
1461 return PyUnicode_DecodeUTF8(s, size, "replace");
1462 }
1463}
1464
Martin v. Löwis5b222132007-06-10 09:51:05 +00001465char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001466_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001467{
Christian Heimesf3863112007-11-22 07:46:41 +00001468 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001469 if (!PyUnicode_Check(unicode)) {
1470 PyErr_BadArgument();
1471 return NULL;
1472 }
Christian Heimesf3863112007-11-22 07:46:41 +00001473 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1474 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001475 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001476 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001477 *psize = PyBytes_GET_SIZE(bytes);
1478 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001479}
1480
1481char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001482_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001483{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001484 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001485}
1486
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1488{
1489 if (!PyUnicode_Check(unicode)) {
1490 PyErr_BadArgument();
1491 goto onError;
1492 }
1493 return PyUnicode_AS_UNICODE(unicode);
1494
1495 onError:
1496 return NULL;
1497}
1498
Martin v. Löwis18e16552006-02-15 17:27:45 +00001499Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500{
1501 if (!PyUnicode_Check(unicode)) {
1502 PyErr_BadArgument();
1503 goto onError;
1504 }
1505 return PyUnicode_GET_SIZE(unicode);
1506
1507 onError:
1508 return -1;
1509}
1510
Thomas Wouters78890102000-07-22 19:25:51 +00001511const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001512{
1513 return unicode_default_encoding;
1514}
1515
1516int PyUnicode_SetDefaultEncoding(const char *encoding)
1517{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001518 if (strcmp(encoding, unicode_default_encoding) != 0) {
1519 PyErr_Format(PyExc_ValueError,
1520 "Can only set default encoding to %s",
1521 unicode_default_encoding);
1522 return -1;
1523 }
Fred Drakee4315f52000-05-09 19:53:39 +00001524 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001525}
1526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527/* error handling callback helper:
1528 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001529 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 and adjust various state variables.
1531 return 0 on success, -1 on error
1532*/
1533
1534static
1535int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1536 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001537 const char **input, const char **inend, Py_ssize_t *startinpos,
1538 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001541 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542
1543 PyObject *restuple = NULL;
1544 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001545 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001546 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001547 Py_ssize_t requiredsize;
1548 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001550 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001551 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 int res = -1;
1553
1554 if (*errorHandler == NULL) {
1555 *errorHandler = PyCodec_LookupError(errors);
1556 if (*errorHandler == NULL)
1557 goto onError;
1558 }
1559
1560 if (*exceptionObject == NULL) {
1561 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001562 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 if (*exceptionObject == NULL)
1564 goto onError;
1565 }
1566 else {
1567 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1568 goto onError;
1569 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1570 goto onError;
1571 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1572 goto onError;
1573 }
1574
1575 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1576 if (restuple == NULL)
1577 goto onError;
1578 if (!PyTuple_Check(restuple)) {
1579 PyErr_Format(PyExc_TypeError, &argparse[4]);
1580 goto onError;
1581 }
1582 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1583 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001584
1585 /* Copy back the bytes variables, which might have been modified by the
1586 callback */
1587 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1588 if (!inputobj)
1589 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001590 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001591 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1592 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001593 *input = PyBytes_AS_STRING(inputobj);
1594 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001595 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001596 /* we can DECREF safely, as the exception has another reference,
1597 so the object won't go away. */
1598 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001601 newpos = insize+newpos;
1602 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001603 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001604 goto onError;
1605 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606
1607 /* need more space? (at least enough for what we
1608 have+the replacement+the rest of the string (starting
1609 at the new input position), so we won't have to check space
1610 when there are no errors in the rest of the string) */
1611 repptr = PyUnicode_AS_UNICODE(repunicode);
1612 repsize = PyUnicode_GET_SIZE(repunicode);
1613 requiredsize = *outpos + repsize + insize-newpos;
1614 if (requiredsize > outsize) {
1615 if (requiredsize<2*outsize)
1616 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 goto onError;
1619 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1620 }
1621 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001622 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001623 Py_UNICODE_COPY(*outptr, repptr, repsize);
1624 *outptr += repsize;
1625 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 /* we made it! */
1628 res = 0;
1629
1630 onError:
1631 Py_XDECREF(restuple);
1632 return res;
1633}
1634
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635/* --- UTF-7 Codec -------------------------------------------------------- */
1636
1637/* see RFC2152 for details */
1638
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,    /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,    /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,    /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,    /* 0x70 - 0x7f */

};
1657
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if c must be base64-encoded: outside ASCII, always-special, or
   in one of the optional classes (whitespace / Set O) when the matching
   flag is set. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 digit. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Map a base64 digit back to its 6-bit value (c must satisfy B64CHAR). */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits for the top bits of ch while at least 6 bits are
   pending.  Updates out and bits in place.  Wrapped in do/while(0) so
   the macro behaves as a single statement. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits)-6));                 \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* Emit 16-bit units from the top bits of ch while at least 16 bits are
   pending.  Updates out, bits and surrogate in place.  Relies on the
   enclosing function providing `errmsg` and a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                        \
    do {                                                                        \
        while ((bits) >= 16) {                                                  \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);   \
            (bits) -= 16;                                                       \
            if (surrogate) {                                                    \
                /* We have already generated an error for the high surrogate */ \
                /* so let's not bother seeing if the low surrogate is */        \
                /* correct or not */                                            \
                (surrogate) = 0;                                                \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
                /* This is a surrogate pair. Unfortunately we can't */          \
                /* represent it in a 16-bit character */                        \
                (surrogate) = 1;                                                \
                errmsg = "code pairs are not supported";                        \
                goto utf7Error;                                                 \
            } else {                                                            \
                *(out)++ = outCh;                                               \
            }                                                                   \
        }                                                                       \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001701PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001702 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703 const char *errors)
1704{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001705 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1706}
1707
/* Decode size bytes of UTF-7 data starting at s into a new unicode object.
   Returns the new object, or NULL on error.

   errors is forwarded to unicode_decode_call_errorhandler() whenever
   malformed input is found.

   consumed may be NULL.  When it is non-NULL, a shift sequence that is
   still open at the end of the input is not reported as an error;
   *consumed is then set to the offset of the '+' that opened it.
   Otherwise *consumed is set to the offset reached in the input. */
1708PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1709 Py_ssize_t size,
1710 const char *errors,
1711 Py_ssize_t *consumed)
1712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001714 Py_ssize_t startinpos;
1715 Py_ssize_t endinpos;
1716 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717 const char *e;
1718 PyUnicodeObject *unicode;
1719 Py_UNICODE *p;
1720 const char *errmsg = "";
    /* inShift: currently inside a "+...-" base64 section.
       charsleft/bitsleft: accumulated base64 bits and how many are pending.
       surrogate: flag maintained by the DECODE macro. */
1721 int inShift = 0;
1722 unsigned int bitsleft = 0;
1723 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 int surrogate = 0;
1725 PyObject *errorHandler = NULL;
1726 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001727
    /* The output is allocated with one slot per input byte and is resized
       to the number of units actually written before returning. */
1728 unicode = _PyUnicode_New(size);
1729 if (!unicode)
1730 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001731 if (size == 0) {
1732 if (consumed)
1733 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001735 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001736
1737 p = unicode->str;
1738 e = s + size;
1739
1740 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001741 Py_UNICODE ch;
    /* restart: re-entry point used by the end-of-input error path below
       when the error handler left s inside the input again */
1742 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001743 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
1745 if (inShift) {
    /* a '-' or any character that is not a base64 digit ends the shift
       sequence */
1746 if ((ch == '-') || !B64CHAR(ch)) {
1747 inShift = 0;
1748 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001749
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1751 if (bitsleft >= 6) {
1752 /* The shift sequence has a partial character in it. If
1753 bitsleft < 6 then we could just classify it as padding
1754 but that is not the case here */
1755
1756 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001757 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001758 }
1759 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001760 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 here so indicate the potential of a misencoded character. */
1762
1763 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1764 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1765 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001766 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 }
1768
1769 if (ch == '-') {
    /* The terminating '-' itself produces no output.  If another '-'
       follows, a literal '-' is written and the shift state is re-entered
       without advancing s, so the next iteration consumes that '-' as a
       terminator. */
1770 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001771 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 inShift = 1;
1773 }
1774 } else if (SPECIAL(ch,0,0)) {
1775 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001776 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 } else {
    /* a non-base64, non-special character ends the sequence implicitly and
       is copied to the output */
1778 *p++ = ch;
1779 }
1780 } else {
    /* base64 digit: append its 6 bits and flush complete 16-bit units */
1781 charsleft = (charsleft << 6) | UB64(ch);
1782 bitsleft += 6;
1783 s++;
1784 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1785 }
1786 }
1787 else if ( ch == '+' ) {
    /* remember where the shift sequence starts; used for error reporting
       and for *consumed */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001789 s++;
    /* "+-" stands for a literal '+' */
1790 if (s < e && *s == '-') {
1791 s++;
1792 *p++ = '+';
1793 } else
1794 {
1795 inShift = 1;
1796 bitsleft = 0;
1797 }
1798 }
1799 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001800 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001801 errmsg = "unexpected special character";
1802 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001803 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 }
1805 else {
    /* directly encoded character */
1806 *p++ = ch;
1807 s++;
1808 }
1809 continue;
    /* Common error path: report startinpos..endinpos to the error handler,
       which receives the addresses of starts, e, s, unicode and p and may
       update them. */
1810 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 outpos = p-PyUnicode_AS_UNICODE(unicode);
1812 endinpos = s-starts;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001816 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 (PyObject **)&unicode, &outpos, &p))
1818 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 }
1820
    /* An open shift sequence at the end of the input is an error only when
       the caller did not ask for the consumed count. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001821 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 outpos = p-PyUnicode_AS_UNICODE(unicode);
1823 endinpos = size;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001827 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001829 goto onError;
    /* the handler may have moved s back before e: resume decoding */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 if (s < e)
1831 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001833 if (consumed) {
1834 if(inShift)
1835 *consumed = startinpos;
1836 else
1837 *consumed = s-starts;
1838 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839
    /* trim the output to the number of units actually written */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001840 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841 goto onError;
1842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 Py_XDECREF(errorHandler);
1844 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 return (PyObject *)unicode;
1846
1847onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 Py_XDECREF(errorHandler);
1849 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850 Py_DECREF(unicode);
1851 return NULL;
1852}
1853
1854
1855PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001856 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001857 int encodeSetO,
1858 int encodeWhiteSpace,
1859 const char *errors)
1860{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001861 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001862 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001863 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001865 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001866 unsigned int bitsleft = 0;
1867 unsigned long charsleft = 0;
1868 char * out;
1869 char * start;
1870
1871 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001872 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001873
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001874 if (cbAllocated / 5 != size)
1875 return PyErr_NoMemory();
1876
Christian Heimes9c4756e2008-05-26 13:22:05 +00001877 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878 if (v == NULL)
1879 return NULL;
1880
Christian Heimes9c4756e2008-05-26 13:22:05 +00001881 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001882 for (;i < size; ++i) {
1883 Py_UNICODE ch = s[i];
1884
1885 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001886 if (ch == '+') {
1887 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888 *out++ = '-';
1889 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1890 charsleft = ch;
1891 bitsleft = 16;
1892 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001893 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001894 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001895 } else {
1896 *out++ = (char) ch;
1897 }
1898 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001899 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1900 *out++ = B64(charsleft << (6-bitsleft));
1901 charsleft = 0;
1902 bitsleft = 0;
1903 /* Characters not in the BASE64 set implicitly unshift the sequence
1904 so no '-' is required, except if the character is itself a '-' */
1905 if (B64CHAR(ch) || ch == '-') {
1906 *out++ = '-';
1907 }
1908 inShift = 0;
1909 *out++ = (char) ch;
1910 } else {
1911 bitsleft += 16;
1912 charsleft = (charsleft << 16) | ch;
1913 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1914
1915 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001916 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001917 or '-' then the shift sequence will be terminated implicitly and we
1918 don't have to insert a '-'. */
1919
1920 if (bitsleft == 0) {
1921 if (i + 1 < size) {
1922 Py_UNICODE ch2 = s[i+1];
1923
1924 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001925
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 } else if (B64CHAR(ch2) || ch2 == '-') {
1927 *out++ = '-';
1928 inShift = 0;
1929 } else {
1930 inShift = 0;
1931 }
1932
1933 }
1934 else {
1935 *out++ = '-';
1936 inShift = 0;
1937 }
1938 }
Tim Petersced69f82003-09-16 20:30:58 +00001939 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001940 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001941 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001942 if (bitsleft) {
1943 *out++= B64(charsleft << (6-bitsleft) );
1944 *out++ = '-';
1945 }
1946
Christian Heimes72b710a2008-05-26 13:28:38 +00001947 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001948 Py_DECREF(v);
1949 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950}
1951
1952#undef SPECIAL
1953#undef B64
1954#undef B64CHAR
1955#undef UB64
1956#undef ENCODE
1957#undef DECODE
1958
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959/* --- UTF-8 Codec -------------------------------------------------------- */
1960
/* Lookup table indexed by the first byte of a UTF-8 sequence.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x20 - 0x2f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x30 - 0x3f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x40 - 0x4f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x50 - 0x5f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x60 - 0x6f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x80 - 0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x90 - 0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xa0 - 0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xb0 - 0xbf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xc0 - 0xcf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xd0 - 0xdf */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0xe0 - 0xef */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0     /* 0xf0 - 0xff */
};
1982
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 const char *errors)
1986{
Walter Dörwald69652032004-09-07 20:24:22 +00001987 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1988}
1989
/* Decode size bytes of UTF-8 data starting at s into a new unicode object.
   Returns the new object, or NULL on error.

   errors is forwarded to unicode_decode_call_errorhandler() whenever
   malformed input is found.

   consumed may be NULL.  When it is non-NULL, a multi-byte sequence that
   is cut off by the end of the input stops decoding instead of being
   reported as an error, and *consumed receives the offset reached in the
   input. */
1990PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001991 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001992 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001993 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001994{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001997 Py_ssize_t startinpos;
1998 Py_ssize_t endinpos;
1999 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 const char *e;
2001 PyUnicodeObject *unicode;
2002 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002003 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 PyObject *errorHandler = NULL;
2005 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006
2007 /* Note: size will always be longer than the resulting Unicode
2008 character count */
2009 unicode = _PyUnicode_New(size);
2010 if (!unicode)
2011 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002012 if (size == 0) {
2013 if (consumed)
2014 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017
2018 /* Unpack UTF-8 encoded data */
2019 p = unicode->str;
2020 e = s + size;
2021
2022 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024
    /* fast path: a byte below 0x80 is copied as is */
2025 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002026 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027 s++;
2028 continue;
2029 }
2030
    /* sequence length implied by the first byte; 0 marks an illegal
       first byte */
2031 n = utf8_code_length[ch];
2032
    /* the sequence would run past the end of the input */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002034 if (consumed)
2035 break;
2036 else {
2037 errmsg = "unexpected end of data";
2038 startinpos = s-starts;
2039 endinpos = size;
2040 goto utf8Error;
2041 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
2044 switch (n) {
2045
2046 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002047 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 startinpos = s-starts;
2049 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002050 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
    /* bytes below 0x80 were already handled by the fast path above, so a
       length of 1 is not expected here */
2052 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002053 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 startinpos = s-starts;
2055 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002056 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 case 2:
    /* the second byte must have the form 10xxxxxx */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002059 if ((s[1] & 0xc0) != 0x80) {
2060 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 startinpos = s-starts;
2062 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002063 goto utf8Error;
2064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
    /* a 2-byte sequence that decodes to a value below 0x80 is rejected */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002066 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 startinpos = s-starts;
2068 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002069 errmsg = "illegal encoding";
2070 goto utf8Error;
2071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002073 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 break;
2075
2076 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002077 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002078 (s[2] & 0xc0) != 0x80) {
2079 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
2081 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002082 goto utf8Error;
2083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
    /* a 3-byte sequence that decodes to a value below 0x0800 is rejected */
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085 if (ch < 0x0800) {
2086 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002087 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002088
2089 XXX For wide builds (UCS-4) we should probably try
2090 to recombine the surrogates into a single code
2091 unit.
2092 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002093 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 startinpos = s-starts;
2095 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002096 goto utf8Error;
2097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002099 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002100 break;
2101
2102 case 4:
2103 if ((s[1] & 0xc0) != 0x80 ||
2104 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 (s[3] & 0xc0) != 0x80) {
2106 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 startinpos = s-starts;
2108 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002109 goto utf8Error;
2110 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002111 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2112 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2113 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002115 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002117 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002118 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002119 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = s-starts;
2121 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002122 goto utf8Error;
2123 }
    /* with Py_UNICODE_WIDE the value is stored as a single unit; otherwise
       it is split into two surrogate units */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002124#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002125 *p++ = (Py_UNICODE)ch;
2126#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002127 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002128
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002129 /* translate from 10000..10FFFF to 0..FFFF */
2130 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002131
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002132 /* high surrogate = top 10 bits added to D800 */
2133 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002134
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002136 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002137#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 break;
2139
    /* the lookup table also yields 5 and 6; those lengths are rejected */
2140 default:
2141 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002142 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 startinpos = s-starts;
2144 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002145 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 }
    /* the sequence was decoded successfully: skip all of its bytes */
2147 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002148 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002149
    /* Common error path: report startinpos..endinpos to the error handler,
       which receives the addresses of starts, e, s, unicode and p and may
       update them. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002150 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 outpos = p-PyUnicode_AS_UNICODE(unicode);
2152 if (unicode_decode_call_errorhandler(
2153 errors, &errorHandler,
2154 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002155 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 (PyObject **)&unicode, &outpos, &p))
2157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 }
Walter Dörwald69652032004-09-07 20:24:22 +00002159 if (consumed)
2160 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161
2162 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002163 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 goto onError;
2165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 Py_XDECREF(errorHandler);
2167 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 return (PyObject *)unicode;
2169
2170onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 Py_XDECREF(errorHandler);
2172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 Py_DECREF(unicode);
2174 return NULL;
2175}
2176
Tim Peters602f7402002-04-27 18:03:26 +00002177/* Allocation strategy: if the string is short, convert into a stack buffer
2178 and allocate exactly as much space needed at the end. Else allocate the
2179 maximum possible needed (4 result bytes per Unicode character), and return
2180 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002181*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002182PyObject *
2183PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002185 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186{
Tim Peters602f7402002-04-27 18:03:26 +00002187#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002188
Guido van Rossum98297ee2007-11-06 21:34:58 +00002189 Py_ssize_t i; /* index into s of next input byte */
2190 PyObject *result; /* result string object */
2191 char *p; /* next free byte in output buffer */
2192 Py_ssize_t nallocated; /* number of result bytes allocated */
2193 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002194 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002195
Tim Peters602f7402002-04-27 18:03:26 +00002196 assert(s != NULL);
2197 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198
Tim Peters602f7402002-04-27 18:03:26 +00002199 if (size <= MAX_SHORT_UNICHARS) {
2200 /* Write into the stack buffer; nallocated can't overflow.
2201 * At the end, we'll allocate exactly as much heap space as it
2202 * turns out we need.
2203 */
2204 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002205 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002206 p = stackbuf;
2207 }
2208 else {
2209 /* Overallocate on the heap, and give the excess back at the end. */
2210 nallocated = size * 4;
2211 if (nallocated / 4 != size) /* overflow! */
2212 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002213 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002214 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002215 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002216 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002217 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002218
Tim Peters602f7402002-04-27 18:03:26 +00002219 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002220 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002221
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002222 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002223 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002225
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002227 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002228 *p++ = (char)(0xc0 | (ch >> 6));
2229 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002230 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002231 else {
Tim Peters602f7402002-04-27 18:03:26 +00002232 /* Encode UCS2 Unicode ordinals */
2233 if (ch < 0x10000) {
2234 /* Special case: check for high surrogate */
2235 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2236 Py_UCS4 ch2 = s[i];
2237 /* Check for low surrogate and combine the two to
2238 form a UCS4 value */
2239 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002240 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002241 i++;
2242 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 }
Tim Peters602f7402002-04-27 18:03:26 +00002244 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002245 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002246 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002247 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2248 *p++ = (char)(0x80 | (ch & 0x3f));
2249 continue;
2250 }
2251encodeUCS4:
2252 /* Encode UCS4 Unicode ordinals */
2253 *p++ = (char)(0xf0 | (ch >> 18));
2254 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2255 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2256 *p++ = (char)(0x80 | (ch & 0x3f));
2257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002259
Guido van Rossum98297ee2007-11-06 21:34:58 +00002260 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002261 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002262 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002263 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002264 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002265 }
2266 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002267 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002268 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002269 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002270 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002271 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002272 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002273
Tim Peters602f7402002-04-27 18:03:26 +00002274#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275}
2276
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2278{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 if (!PyUnicode_Check(unicode)) {
2280 PyErr_BadArgument();
2281 return NULL;
2282 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002283 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2284 PyUnicode_GET_SIZE(unicode),
2285 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286}
2287
Walter Dörwald41980ca2007-08-16 21:55:45 +00002288/* --- UTF-32 Codec ------------------------------------------------------- */
2289
2290PyObject *
2291PyUnicode_DecodeUTF32(const char *s,
2292 Py_ssize_t size,
2293 const char *errors,
2294 int *byteorder)
2295{
2296 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2297}
2298
2299PyObject *
2300PyUnicode_DecodeUTF32Stateful(const char *s,
2301 Py_ssize_t size,
2302 const char *errors,
2303 int *byteorder,
2304 Py_ssize_t *consumed)
2305{
2306 const char *starts = s;
2307 Py_ssize_t startinpos;
2308 Py_ssize_t endinpos;
2309 Py_ssize_t outpos;
2310 PyUnicodeObject *unicode;
2311 Py_UNICODE *p;
2312#ifndef Py_UNICODE_WIDE
2313 int i, pairs;
2314#else
2315 const int pairs = 0;
2316#endif
2317 const unsigned char *q, *e;
2318 int bo = 0; /* assume native ordering by default */
2319 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002320 /* Offsets from q for retrieving bytes in the right order. */
2321#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2322 int iorder[] = {0, 1, 2, 3};
2323#else
2324 int iorder[] = {3, 2, 1, 0};
2325#endif
2326 PyObject *errorHandler = NULL;
2327 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002328 /* On narrow builds we split characters outside the BMP into two
2329 codepoints => count how much extra space we need. */
2330#ifndef Py_UNICODE_WIDE
2331 for (i = pairs = 0; i < size/4; i++)
2332 if (((Py_UCS4 *)s)[i] >= 0x10000)
2333 pairs++;
2334#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002335
2336 /* This might be one to much, because of a BOM */
2337 unicode = _PyUnicode_New((size+3)/4+pairs);
2338 if (!unicode)
2339 return NULL;
2340 if (size == 0)
2341 return (PyObject *)unicode;
2342
2343 /* Unpack UTF-32 encoded data */
2344 p = unicode->str;
2345 q = (unsigned char *)s;
2346 e = q + size;
2347
2348 if (byteorder)
2349 bo = *byteorder;
2350
2351 /* Check for BOM marks (U+FEFF) in the input and adjust current
2352 byte order setting accordingly. In native mode, the leading BOM
2353 mark is skipped, in all other modes, it is copied to the output
2354 stream as-is (giving a ZWNBSP character). */
2355 if (bo == 0) {
2356 if (size >= 4) {
2357 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2358 (q[iorder[1]] << 8) | q[iorder[0]];
2359#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2360 if (bom == 0x0000FEFF) {
2361 q += 4;
2362 bo = -1;
2363 }
2364 else if (bom == 0xFFFE0000) {
2365 q += 4;
2366 bo = 1;
2367 }
2368#else
2369 if (bom == 0x0000FEFF) {
2370 q += 4;
2371 bo = 1;
2372 }
2373 else if (bom == 0xFFFE0000) {
2374 q += 4;
2375 bo = -1;
2376 }
2377#endif
2378 }
2379 }
2380
2381 if (bo == -1) {
2382 /* force LE */
2383 iorder[0] = 0;
2384 iorder[1] = 1;
2385 iorder[2] = 2;
2386 iorder[3] = 3;
2387 }
2388 else if (bo == 1) {
2389 /* force BE */
2390 iorder[0] = 3;
2391 iorder[1] = 2;
2392 iorder[2] = 1;
2393 iorder[3] = 0;
2394 }
2395
2396 while (q < e) {
2397 Py_UCS4 ch;
2398 /* remaining bytes at the end? (size should be divisible by 4) */
2399 if (e-q<4) {
2400 if (consumed)
2401 break;
2402 errmsg = "truncated data";
2403 startinpos = ((const char *)q)-starts;
2404 endinpos = ((const char *)e)-starts;
2405 goto utf32Error;
2406 /* The remaining input chars are ignored if the callback
2407 chooses to skip the input */
2408 }
2409 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2410 (q[iorder[1]] << 8) | q[iorder[0]];
2411
2412 if (ch >= 0x110000)
2413 {
2414 errmsg = "codepoint not in range(0x110000)";
2415 startinpos = ((const char *)q)-starts;
2416 endinpos = startinpos+4;
2417 goto utf32Error;
2418 }
2419#ifndef Py_UNICODE_WIDE
2420 if (ch >= 0x10000)
2421 {
2422 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2423 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2424 }
2425 else
2426#endif
2427 *p++ = ch;
2428 q += 4;
2429 continue;
2430 utf32Error:
2431 outpos = p-PyUnicode_AS_UNICODE(unicode);
2432 if (unicode_decode_call_errorhandler(
2433 errors, &errorHandler,
2434 "utf32", errmsg,
2435 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2436 (PyObject **)&unicode, &outpos, &p))
2437 goto onError;
2438 }
2439
2440 if (byteorder)
2441 *byteorder = bo;
2442
2443 if (consumed)
2444 *consumed = (const char *)q-starts;
2445
2446 /* Adjust length */
2447 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2448 goto onError;
2449
2450 Py_XDECREF(errorHandler);
2451 Py_XDECREF(exc);
2452 return (PyObject *)unicode;
2453
2454onError:
2455 Py_DECREF(unicode);
2456 Py_XDECREF(errorHandler);
2457 Py_XDECREF(exc);
2458 return NULL;
2459}
2460
2461PyObject *
2462PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2463 Py_ssize_t size,
2464 const char *errors,
2465 int byteorder)
2466{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002467 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002468 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002469 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002470#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002471 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002472#else
2473 const int pairs = 0;
2474#endif
2475 /* Offsets from p for storing byte pairs in the right order. */
2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2477 int iorder[] = {0, 1, 2, 3};
2478#else
2479 int iorder[] = {3, 2, 1, 0};
2480#endif
2481
2482#define STORECHAR(CH) \
2483 do { \
2484 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2485 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2486 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2487 p[iorder[0]] = (CH) & 0xff; \
2488 p += 4; \
2489 } while(0)
2490
2491 /* In narrow builds we can output surrogate pairs as one codepoint,
2492 so we need less space. */
2493#ifndef Py_UNICODE_WIDE
2494 for (i = pairs = 0; i < size-1; i++)
2495 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2496 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2497 pairs++;
2498#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002499 nsize = (size - pairs + (byteorder == 0));
2500 bytesize = nsize * 4;
2501 if (bytesize / 4 != nsize)
2502 return PyErr_NoMemory();
2503 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002504 if (v == NULL)
2505 return NULL;
2506
Christian Heimes9c4756e2008-05-26 13:22:05 +00002507 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002508 if (byteorder == 0)
2509 STORECHAR(0xFEFF);
2510 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002511 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002512
2513 if (byteorder == -1) {
2514 /* force LE */
2515 iorder[0] = 0;
2516 iorder[1] = 1;
2517 iorder[2] = 2;
2518 iorder[3] = 3;
2519 }
2520 else if (byteorder == 1) {
2521 /* force BE */
2522 iorder[0] = 3;
2523 iorder[1] = 2;
2524 iorder[2] = 1;
2525 iorder[3] = 0;
2526 }
2527
2528 while (size-- > 0) {
2529 Py_UCS4 ch = *s++;
2530#ifndef Py_UNICODE_WIDE
2531 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2532 Py_UCS4 ch2 = *s;
2533 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2534 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2535 s++;
2536 size--;
2537 }
2538 }
2539#endif
2540 STORECHAR(ch);
2541 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002542
2543 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002544 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002545 Py_DECREF(v);
2546 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002547#undef STORECHAR
2548}
2549
2550PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2551{
2552 if (!PyUnicode_Check(unicode)) {
2553 PyErr_BadArgument();
2554 return NULL;
2555 }
2556 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2557 PyUnicode_GET_SIZE(unicode),
2558 NULL,
2559 0);
2560}
2561
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562/* --- UTF-16 Codec ------------------------------------------------------- */
2563
Tim Peters772747b2001-08-09 22:21:55 +00002564PyObject *
2565PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002566 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002567 const char *errors,
2568 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569{
Walter Dörwald69652032004-09-07 20:24:22 +00002570 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2571}
2572
2573PyObject *
2574PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002575 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002576 const char *errors,
2577 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002579{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t startinpos;
2582 Py_ssize_t endinpos;
2583 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002586 const unsigned char *q, *e;
2587 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002588 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002589 /* Offsets from q for retrieving byte pairs in the right order. */
2590#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2591 int ihi = 1, ilo = 0;
2592#else
2593 int ihi = 0, ilo = 1;
2594#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 PyObject *errorHandler = NULL;
2596 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597
2598 /* Note: size will always be longer than the resulting Unicode
2599 character count */
2600 unicode = _PyUnicode_New(size);
2601 if (!unicode)
2602 return NULL;
2603 if (size == 0)
2604 return (PyObject *)unicode;
2605
2606 /* Unpack UTF-16 encoded data */
2607 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002608 q = (unsigned char *)s;
2609 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002612 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002614 /* Check for BOM marks (U+FEFF) in the input and adjust current
2615 byte order setting accordingly. In native mode, the leading BOM
2616 mark is skipped, in all other modes, it is copied to the output
2617 stream as-is (giving a ZWNBSP character). */
2618 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002619 if (size >= 2) {
2620 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002621#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002622 if (bom == 0xFEFF) {
2623 q += 2;
2624 bo = -1;
2625 }
2626 else if (bom == 0xFFFE) {
2627 q += 2;
2628 bo = 1;
2629 }
Tim Petersced69f82003-09-16 20:30:58 +00002630#else
Walter Dörwald69652032004-09-07 20:24:22 +00002631 if (bom == 0xFEFF) {
2632 q += 2;
2633 bo = 1;
2634 }
2635 else if (bom == 0xFFFE) {
2636 q += 2;
2637 bo = -1;
2638 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002639#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002640 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642
Tim Peters772747b2001-08-09 22:21:55 +00002643 if (bo == -1) {
2644 /* force LE */
2645 ihi = 1;
2646 ilo = 0;
2647 }
2648 else if (bo == 1) {
2649 /* force BE */
2650 ihi = 0;
2651 ilo = 1;
2652 }
2653
2654 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002656 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002658 if (consumed)
2659 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 errmsg = "truncated data";
2661 startinpos = ((const char *)q)-starts;
2662 endinpos = ((const char *)e)-starts;
2663 goto utf16Error;
2664 /* The remaining input chars are ignored if the callback
2665 chooses to skip the input */
2666 }
2667 ch = (q[ihi] << 8) | q[ilo];
2668
Tim Peters772747b2001-08-09 22:21:55 +00002669 q += 2;
2670
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 if (ch < 0xD800 || ch > 0xDFFF) {
2672 *p++ = ch;
2673 continue;
2674 }
2675
2676 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002677 if (q >= e) {
2678 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 startinpos = (((const char *)q)-2)-starts;
2680 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002681 goto utf16Error;
2682 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002683 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002684 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2685 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002686 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002687#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002688 *p++ = ch;
2689 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002690#else
2691 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002692#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002693 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694 }
2695 else {
2696 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 startinpos = (((const char *)q)-4)-starts;
2698 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002699 goto utf16Error;
2700 }
2701
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002703 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 startinpos = (((const char *)q)-2)-starts;
2705 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002706 /* Fall through to report the error */
2707
2708 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 outpos = p-PyUnicode_AS_UNICODE(unicode);
2710 if (unicode_decode_call_errorhandler(
2711 errors, &errorHandler,
2712 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002713 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002715 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 }
2717
2718 if (byteorder)
2719 *byteorder = bo;
2720
Walter Dörwald69652032004-09-07 20:24:22 +00002721 if (consumed)
2722 *consumed = (const char *)q-starts;
2723
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002725 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 goto onError;
2727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return (PyObject *)unicode;
2731
2732onError:
2733 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 Py_XDECREF(errorHandler);
2735 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 return NULL;
2737}
2738
Tim Peters772747b2001-08-09 22:21:55 +00002739PyObject *
2740PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002741 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002742 const char *errors,
2743 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002745 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002746 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002747 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002748#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002749 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002750#else
2751 const int pairs = 0;
2752#endif
Tim Peters772747b2001-08-09 22:21:55 +00002753 /* Offsets from p for storing byte pairs in the right order. */
2754#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2755 int ihi = 1, ilo = 0;
2756#else
2757 int ihi = 0, ilo = 1;
2758#endif
2759
2760#define STORECHAR(CH) \
2761 do { \
2762 p[ihi] = ((CH) >> 8) & 0xff; \
2763 p[ilo] = (CH) & 0xff; \
2764 p += 2; \
2765 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002767#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002768 for (i = pairs = 0; i < size; i++)
2769 if (s[i] >= 0x10000)
2770 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002771#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002772 /* 2 * (size + pairs + (byteorder == 0)) */
2773 if (size > PY_SSIZE_T_MAX ||
2774 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2775 return PyErr_NoMemory();
2776 nsize = size + pairs + (byteorder == 0);
2777 bytesize = nsize * 2;
2778 if (bytesize / 2 != nsize)
2779 return PyErr_NoMemory();
2780 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 if (v == NULL)
2782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
Christian Heimes9c4756e2008-05-26 13:22:05 +00002784 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002786 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002787 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002788 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002789
2790 if (byteorder == -1) {
2791 /* force LE */
2792 ihi = 1;
2793 ilo = 0;
2794 }
2795 else if (byteorder == 1) {
2796 /* force BE */
2797 ihi = 0;
2798 ilo = 1;
2799 }
2800
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002801 while (size-- > 0) {
2802 Py_UNICODE ch = *s++;
2803 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002804#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002805 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002806 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2807 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002809#endif
Tim Peters772747b2001-08-09 22:21:55 +00002810 STORECHAR(ch);
2811 if (ch2)
2812 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002813 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002814
2815 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002816 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002817 Py_DECREF(v);
2818 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002819#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820}
2821
2822PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL,
2831 0);
2832}
2833
2834/* --- Unicode Escape Codec ----------------------------------------------- */
2835
/* Cached pointer to the character-name lookup API used for \N{name}
   escapes.  Starts out NULL; PyUnicode_DecodeUnicodeEscape() fills it in
   on first use from the "ucnhash_CAPI" attribute of the unicodedata
   module. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002836static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002839 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 const char *errors)
2841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 Py_ssize_t startinpos;
2844 Py_ssize_t endinpos;
2845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002850 char* message;
2851 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 PyObject *errorHandler = NULL;
2853 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Escaped strings will always be longer than the resulting
2856 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 length after conversion to the true value.
2858 (but if the error callback returns a long replacement string
2859 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 v = _PyUnicode_New(size);
2861 if (v == NULL)
2862 goto onError;
2863 if (size == 0)
2864 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002868
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 while (s < end) {
2870 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002871 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
2874 /* Non-escape characters are interpreted as Unicode ordinals */
2875 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002876 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 continue;
2878 }
2879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 /* \ - Escapes */
2882 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002883 c = *s++;
2884 if (s > end)
2885 c = '\0'; /* Invalid after \ */
2886 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 /* \x escapes */
2889 case '\n': break;
2890 case '\\': *p++ = '\\'; break;
2891 case '\'': *p++ = '\''; break;
2892 case '\"': *p++ = '\"'; break;
2893 case 'b': *p++ = '\b'; break;
2894 case 'f': *p++ = '\014'; break; /* FF */
2895 case 't': *p++ = '\t'; break;
2896 case 'n': *p++ = '\n'; break;
2897 case 'r': *p++ = '\r'; break;
2898 case 'v': *p++ = '\013'; break; /* VT */
2899 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2900
2901 /* \OOO (octal) escapes */
2902 case '0': case '1': case '2': case '3':
2903 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002904 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002905 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002906 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002907 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002908 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002910 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 break;
2912
Fredrik Lundhccc74732001-02-18 22:13:49 +00002913 /* hex escapes */
2914 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 digits = 2;
2917 message = "truncated \\xXX escape";
2918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002922 digits = 4;
2923 message = "truncated \\uXXXX escape";
2924 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002927 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002928 digits = 8;
2929 message = "truncated \\UXXXXXXXX escape";
2930 hexescape:
2931 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 outpos = p-PyUnicode_AS_UNICODE(v);
2933 if (s+digits>end) {
2934 endinpos = size;
2935 if (unicode_decode_call_errorhandler(
2936 errors, &errorHandler,
2937 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002938 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 (PyObject **)&v, &outpos, &p))
2940 goto onError;
2941 goto nextByte;
2942 }
2943 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002944 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002945 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 endinpos = (s+i+1)-starts;
2947 if (unicode_decode_call_errorhandler(
2948 errors, &errorHandler,
2949 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002950 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002952 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002954 }
2955 chr = (chr<<4) & ~0xF;
2956 if (c >= '0' && c <= '9')
2957 chr += c - '0';
2958 else if (c >= 'a' && c <= 'f')
2959 chr += 10 + c - 'a';
2960 else
2961 chr += 10 + c - 'A';
2962 }
2963 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002964 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 /* _decoding_error will have already written into the
2966 target buffer. */
2967 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002968 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002969 /* when we get here, chr is a 32-bit unicode character */
2970 if (chr <= 0xffff)
2971 /* UCS-2 character */
2972 *p++ = (Py_UNICODE) chr;
2973 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002974 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002975 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002976#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002977 *p++ = chr;
2978#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002979 chr -= 0x10000L;
2980 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002981 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002982#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002983 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 endinpos = s-starts;
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 if (unicode_decode_call_errorhandler(
2987 errors, &errorHandler,
2988 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002989 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002991 goto onError;
2992 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002993 break;
2994
2995 /* \N{name} */
2996 case 'N':
2997 message = "malformed \\N character escape";
2998 if (ucnhash_CAPI == NULL) {
2999 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003000 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003001 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003002 if (m == NULL)
3003 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003004 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003005 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003006 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003007 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003008 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003009 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003010 if (ucnhash_CAPI == NULL)
3011 goto ucnhashError;
3012 }
3013 if (*s == '{') {
3014 const char *start = s+1;
3015 /* look for the closing brace */
3016 while (*s != '}' && s < end)
3017 s++;
3018 if (s > start && s < end && *s == '}') {
3019 /* found a name. look it up in the unicode database */
3020 message = "unknown Unicode character name";
3021 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003022 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003023 goto store;
3024 }
3025 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 endinpos = s-starts;
3027 outpos = p-PyUnicode_AS_UNICODE(v);
3028 if (unicode_decode_call_errorhandler(
3029 errors, &errorHandler,
3030 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003031 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003033 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003034 break;
3035
3036 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003037 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 message = "\\ at end of string";
3039 s--;
3040 endinpos = s-starts;
3041 outpos = p-PyUnicode_AS_UNICODE(v);
3042 if (unicode_decode_call_errorhandler(
3043 errors, &errorHandler,
3044 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003045 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003047 goto onError;
3048 }
3049 else {
3050 *p++ = '\\';
3051 *p++ = (unsigned char)s[-1];
3052 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003053 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 nextByte:
3056 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003058 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003060 Py_XDECREF(errorHandler);
3061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003063
Fredrik Lundhccc74732001-02-18 22:13:49 +00003064ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003065 PyErr_SetString(
3066 PyExc_UnicodeError,
3067 "\\N escapes not supported (can't load unicodedata module)"
3068 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003072 return NULL;
3073
Fredrik Lundhccc74732001-02-18 22:13:49 +00003074onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 Py_XDECREF(errorHandler);
3077 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 return NULL;
3079}
3080
/* Return a Unicode-Escape string version of the Unicode object.

   The result is returned as a bytes object; no enclosing quotes are
   added.

*/
3087
Thomas Wouters477c8d52006-05-27 19:21:47 +00003088Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3089 Py_ssize_t size,
3090 Py_UNICODE ch)
3091{
3092 /* like wcschr, but doesn't stop at NULL characters */
3093
3094 while (size-- > 0) {
3095 if (*s == ch)
3096 return s;
3097 s++;
3098 }
3099
3100 return NULL;
3101}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003102
/* Lowercase hexadecimal digits indexed by nibble value (0-15); used by the
   escape encoders in this file when emitting \x, \u and \U sequences.
   Both the characters and the pointer itself are const, so the table can
   neither be modified nor re-pointed by accident. */
static const char * const hexdigits = "0123456789abcdef";
3104
3105PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3106 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003108 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003111#ifdef Py_UNICODE_WIDE
3112 const Py_ssize_t expandsize = 10;
3113#else
3114 const Py_ssize_t expandsize = 6;
3115#endif
3116
Thomas Wouters89f507f2006-12-13 04:49:30 +00003117 /* XXX(nnorwitz): rather than over-allocating, it would be
3118 better to choose a different scheme. Perhaps scan the
3119 first N-chars of the string and allocate based on that size.
3120 */
3121 /* Initial allocation is based on the longest-possible unichr
3122 escape.
3123
3124 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3125 unichr, so in this case it's the longest unichr escape. In
3126 narrow (UTF-16) builds this is five chars per source unichr
3127 since there are two unichrs in the surrogate pair, so in narrow
3128 (UTF-16) builds it's not the longest unichr escape.
3129
3130 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3131 so in the narrow (UTF-16) build case it's the longest unichr
3132 escape.
3133 */
3134
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003135 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3136 return PyErr_NoMemory();
3137
Christian Heimes9c4756e2008-05-26 13:22:05 +00003138 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003139 2
3140 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003141 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 if (repr == NULL)
3143 return NULL;
3144
Christian Heimes9c4756e2008-05-26 13:22:05 +00003145 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 while (size-- > 0) {
3148 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003149
Walter Dörwald79e913e2007-05-12 11:08:06 +00003150 /* Escape backslashes */
3151 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 *p++ = '\\';
3153 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003154 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003156
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003157#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003158 /* Map 21-bit characters to '\U00xxxxxx' */
3159 else if (ch >= 0x10000) {
3160 *p++ = '\\';
3161 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003162 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3163 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3164 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3165 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3166 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3167 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3168 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3169 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003170 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003171 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003172#else
3173 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003174 else if (ch >= 0xD800 && ch < 0xDC00) {
3175 Py_UNICODE ch2;
3176 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003177
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003178 ch2 = *s++;
3179 size--;
3180 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3181 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3182 *p++ = '\\';
3183 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003184 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3185 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3186 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3187 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3188 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3189 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3190 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3191 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003192 continue;
3193 }
3194 /* Fall through: isolated surrogates are copied as-is */
3195 s--;
3196 size++;
3197 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003198#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003199
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003201 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 *p++ = '\\';
3203 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003204 *p++ = hexdigits[(ch >> 12) & 0x000F];
3205 *p++ = hexdigits[(ch >> 8) & 0x000F];
3206 *p++ = hexdigits[(ch >> 4) & 0x000F];
3207 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003209
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003210 /* Map special whitespace to '\t', \n', '\r' */
3211 else if (ch == '\t') {
3212 *p++ = '\\';
3213 *p++ = 't';
3214 }
3215 else if (ch == '\n') {
3216 *p++ = '\\';
3217 *p++ = 'n';
3218 }
3219 else if (ch == '\r') {
3220 *p++ = '\\';
3221 *p++ = 'r';
3222 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003223
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003224 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003225 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003227 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003228 *p++ = hexdigits[(ch >> 4) & 0x000F];
3229 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003230 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003231
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 /* Copy everything else as-is */
3233 else
3234 *p++ = (char) ch;
3235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236
Christian Heimes72b710a2008-05-26 13:28:38 +00003237 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003238 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003239 Py_DECREF(repr);
3240 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241}
3242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3244{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003245 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 if (!PyUnicode_Check(unicode)) {
3247 PyErr_BadArgument();
3248 return NULL;
3249 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003250 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3251 PyUnicode_GET_SIZE(unicode));
3252
3253 if (!s)
3254 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003255 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003256 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003257 Py_DECREF(s);
3258 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259}
3260
3261/* --- Raw Unicode Escape Codec ------------------------------------------- */
3262
3263PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 const char *errors)
3266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003268 Py_ssize_t startinpos;
3269 Py_ssize_t endinpos;
3270 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 const char *end;
3274 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 PyObject *errorHandler = NULL;
3276 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 /* Escaped strings will always be longer than the resulting
3279 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 length after conversion to the true value. (But decoding error
3281 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 v = _PyUnicode_New(size);
3283 if (v == NULL)
3284 goto onError;
3285 if (size == 0)
3286 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 end = s + size;
3289 while (s < end) {
3290 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003291 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294
3295 /* Non-escape characters are interpreted as Unicode ordinals */
3296 if (*s != '\\') {
3297 *p++ = (unsigned char)*s++;
3298 continue;
3299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301
3302 /* \u-escapes are only interpreted iff the number of leading
3303 backslashes if odd */
3304 bs = s;
3305 for (;s < end;) {
3306 if (*s != '\\')
3307 break;
3308 *p++ = (unsigned char)*s++;
3309 }
3310 if (((s - bs) & 1) == 0 ||
3311 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003312 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 continue;
3314 }
3315 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003316 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 s++;
3318
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003319 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003321 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003323 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 endinpos = s-starts;
3325 if (unicode_decode_call_errorhandler(
3326 errors, &errorHandler,
3327 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003328 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 }
3333 x = (x<<4) & ~0xF;
3334 if (c >= '0' && c <= '9')
3335 x += c - '0';
3336 else if (c >= 'a' && c <= 'f')
3337 x += 10 + c - 'a';
3338 else
3339 x += 10 + c - 'A';
3340 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003341 if (x <= 0xffff)
3342 /* UCS-2 character */
3343 *p++ = (Py_UNICODE) x;
3344 else if (x <= 0x10ffff) {
3345 /* UCS-4 character. Either store directly, or as
3346 surrogate pair. */
3347#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003348 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003349#else
3350 x -= 0x10000L;
3351 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3352 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3353#endif
3354 } else {
3355 endinpos = s-starts;
3356 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003357 if (unicode_decode_call_errorhandler(
3358 errors, &errorHandler,
3359 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003360 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003361 (PyObject **)&v, &outpos, &p))
3362 goto onError;
3363 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 nextByte:
3365 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003367 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003368 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 Py_XDECREF(errorHandler);
3370 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 onError:
3374 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 Py_XDECREF(errorHandler);
3376 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 return NULL;
3378}
3379
3380PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003383 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 char *p;
3385 char *q;
3386
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003387#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003388 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003389#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003390 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003391#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003392
3393 if (size > PY_SSIZE_T_MAX / expandsize)
3394 return PyErr_NoMemory();
3395
3396 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (repr == NULL)
3398 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003399 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003400 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
Christian Heimes9c4756e2008-05-26 13:22:05 +00003402 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 while (size-- > 0) {
3404 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003405#ifdef Py_UNICODE_WIDE
3406 /* Map 32-bit characters to '\Uxxxxxxxx' */
3407 if (ch >= 0x10000) {
3408 *p++ = '\\';
3409 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003410 *p++ = hexdigits[(ch >> 28) & 0xf];
3411 *p++ = hexdigits[(ch >> 24) & 0xf];
3412 *p++ = hexdigits[(ch >> 20) & 0xf];
3413 *p++ = hexdigits[(ch >> 16) & 0xf];
3414 *p++ = hexdigits[(ch >> 12) & 0xf];
3415 *p++ = hexdigits[(ch >> 8) & 0xf];
3416 *p++ = hexdigits[(ch >> 4) & 0xf];
3417 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003418 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003419 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003420#else
3421 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3422 if (ch >= 0xD800 && ch < 0xDC00) {
3423 Py_UNICODE ch2;
3424 Py_UCS4 ucs;
3425
3426 ch2 = *s++;
3427 size--;
3428 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3429 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3430 *p++ = '\\';
3431 *p++ = 'U';
3432 *p++ = hexdigits[(ucs >> 28) & 0xf];
3433 *p++ = hexdigits[(ucs >> 24) & 0xf];
3434 *p++ = hexdigits[(ucs >> 20) & 0xf];
3435 *p++ = hexdigits[(ucs >> 16) & 0xf];
3436 *p++ = hexdigits[(ucs >> 12) & 0xf];
3437 *p++ = hexdigits[(ucs >> 8) & 0xf];
3438 *p++ = hexdigits[(ucs >> 4) & 0xf];
3439 *p++ = hexdigits[ucs & 0xf];
3440 continue;
3441 }
3442 /* Fall through: isolated surrogates are copied as-is */
3443 s--;
3444 size++;
3445 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003446#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 /* Map 16-bit characters to '\uxxxx' */
3448 if (ch >= 256) {
3449 *p++ = '\\';
3450 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003451 *p++ = hexdigits[(ch >> 12) & 0xf];
3452 *p++ = hexdigits[(ch >> 8) & 0xf];
3453 *p++ = hexdigits[(ch >> 4) & 0xf];
3454 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
3456 /* Copy everything else as-is */
3457 else
3458 *p++ = (char) ch;
3459 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003460 size = p - q;
3461
3462 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003463 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003464 Py_DECREF(repr);
3465 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466}
3467
3468PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3469{
Walter Dörwald711005d2007-05-12 12:03:26 +00003470 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003472 PyErr_BadArgument();
3473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003475 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3476 PyUnicode_GET_SIZE(unicode));
3477
3478 if (!s)
3479 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003480 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003481 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003482 Py_DECREF(s);
3483 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484}
3485
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003486/* --- Unicode Internal Codec ------------------------------------------- */
3487
3488PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003489 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003490 const char *errors)
3491{
3492 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003493 Py_ssize_t startinpos;
3494 Py_ssize_t endinpos;
3495 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003496 PyUnicodeObject *v;
3497 Py_UNICODE *p;
3498 const char *end;
3499 const char *reason;
3500 PyObject *errorHandler = NULL;
3501 PyObject *exc = NULL;
3502
Neal Norwitzd43069c2006-01-08 01:12:10 +00003503#ifdef Py_UNICODE_WIDE
3504 Py_UNICODE unimax = PyUnicode_GetMax();
3505#endif
3506
Thomas Wouters89f507f2006-12-13 04:49:30 +00003507 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003508 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3509 if (v == NULL)
3510 goto onError;
3511 if (PyUnicode_GetSize((PyObject *)v) == 0)
3512 return (PyObject *)v;
3513 p = PyUnicode_AS_UNICODE(v);
3514 end = s + size;
3515
3516 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003517 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003518 /* We have to sanity check the raw data, otherwise doom looms for
3519 some malformed UCS-4 data. */
3520 if (
3521 #ifdef Py_UNICODE_WIDE
3522 *p > unimax || *p < 0 ||
3523 #endif
3524 end-s < Py_UNICODE_SIZE
3525 )
3526 {
3527 startinpos = s - starts;
3528 if (end-s < Py_UNICODE_SIZE) {
3529 endinpos = end-starts;
3530 reason = "truncated input";
3531 }
3532 else {
3533 endinpos = s - starts + Py_UNICODE_SIZE;
3534 reason = "illegal code point (> 0x10FFFF)";
3535 }
3536 outpos = p - PyUnicode_AS_UNICODE(v);
3537 if (unicode_decode_call_errorhandler(
3538 errors, &errorHandler,
3539 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003540 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003541 (PyObject **)&v, &outpos, &p)) {
3542 goto onError;
3543 }
3544 }
3545 else {
3546 p++;
3547 s += Py_UNICODE_SIZE;
3548 }
3549 }
3550
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003551 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003552 goto onError;
3553 Py_XDECREF(errorHandler);
3554 Py_XDECREF(exc);
3555 return (PyObject *)v;
3556
3557 onError:
3558 Py_XDECREF(v);
3559 Py_XDECREF(errorHandler);
3560 Py_XDECREF(exc);
3561 return NULL;
3562}
3563
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564/* --- Latin-1 Codec ------------------------------------------------------ */
3565
3566PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 const char *errors)
3569{
3570 PyUnicodeObject *v;
3571 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003572
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003574 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003575 Py_UNICODE r = *(unsigned char*)s;
3576 return PyUnicode_FromUnicode(&r, 1);
3577 }
3578
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 v = _PyUnicode_New(size);
3580 if (v == NULL)
3581 goto onError;
3582 if (size == 0)
3583 return (PyObject *)v;
3584 p = PyUnicode_AS_UNICODE(v);
3585 while (size-- > 0)
3586 *p++ = (unsigned char)*s++;
3587 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 onError:
3590 Py_XDECREF(v);
3591 return NULL;
3592}
3593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594/* create or adjust a UnicodeEncodeError */
3595static void make_encode_exception(PyObject **exceptionObject,
3596 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003597 const Py_UNICODE *unicode, Py_ssize_t size,
3598 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 if (*exceptionObject == NULL) {
3602 *exceptionObject = PyUnicodeEncodeError_Create(
3603 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 }
3605 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3607 goto onError;
3608 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3609 goto onError;
3610 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3611 goto onError;
3612 return;
3613 onError:
3614 Py_DECREF(*exceptionObject);
3615 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 }
3617}
3618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619/* raises a UnicodeEncodeError */
3620static void raise_encode_exception(PyObject **exceptionObject,
3621 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003622 const Py_UNICODE *unicode, Py_ssize_t size,
3623 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 const char *reason)
3625{
3626 make_encode_exception(exceptionObject,
3627 encoding, unicode, size, startpos, endpos, reason);
3628 if (*exceptionObject != NULL)
3629 PyCodec_StrictErrors(*exceptionObject);
3630}
3631
3632/* error handling callback helper:
3633 build arguments, call the callback and check the arguments,
3634 put the result into newpos and return the replacement string, which
3635 has to be freed by the caller */
3636static PyObject *unicode_encode_call_errorhandler(const char *errors,
3637 PyObject **errorHandler,
3638 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003639 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3640 Py_ssize_t startpos, Py_ssize_t endpos,
3641 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003643 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644
3645 PyObject *restuple;
3646 PyObject *resunicode;
3647
3648 if (*errorHandler == NULL) {
3649 *errorHandler = PyCodec_LookupError(errors);
3650 if (*errorHandler == NULL)
3651 return NULL;
3652 }
3653
3654 make_encode_exception(exceptionObject,
3655 encoding, unicode, size, startpos, endpos, reason);
3656 if (*exceptionObject == NULL)
3657 return NULL;
3658
3659 restuple = PyObject_CallFunctionObjArgs(
3660 *errorHandler, *exceptionObject, NULL);
3661 if (restuple == NULL)
3662 return NULL;
3663 if (!PyTuple_Check(restuple)) {
3664 PyErr_Format(PyExc_TypeError, &argparse[4]);
3665 Py_DECREF(restuple);
3666 return NULL;
3667 }
3668 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3669 &resunicode, newpos)) {
3670 Py_DECREF(restuple);
3671 return NULL;
3672 }
3673 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003674 *newpos = size+*newpos;
3675 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003676 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003677 Py_DECREF(restuple);
3678 return NULL;
3679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 Py_INCREF(resunicode);
3681 Py_DECREF(restuple);
3682 return resunicode;
3683}
3684
3685static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003686 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 const char *errors,
3688 int limit)
3689{
3690 /* output object */
3691 PyObject *res;
3692 /* pointers to the beginning and end+1 of input */
3693 const Py_UNICODE *startp = p;
3694 const Py_UNICODE *endp = p + size;
3695 /* pointer to the beginning of the unencodable characters */
3696 /* const Py_UNICODE *badp = NULL; */
3697 /* pointer into the output */
3698 char *str;
3699 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003700 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003701 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3702 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 PyObject *errorHandler = NULL;
3704 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003705 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 /* the following variable is used for caching string comparisons
3707 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3708 int known_errorHandler = -1;
3709
3710 /* allocate enough for a simple encoding without
3711 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003712 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003713 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003714 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003716 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003717 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 ressize = size;
3719
3720 while (p<endp) {
3721 Py_UNICODE c = *p;
3722
3723 /* can we encode this? */
3724 if (c<limit) {
3725 /* no overflow check, because we know that the space is enough */
3726 *str++ = (char)c;
3727 ++p;
3728 }
3729 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003730 Py_ssize_t unicodepos = p-startp;
3731 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003733 Py_ssize_t repsize;
3734 Py_ssize_t newpos;
3735 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 Py_UNICODE *uni2;
3737 /* startpos for collecting unencodable chars */
3738 const Py_UNICODE *collstart = p;
3739 const Py_UNICODE *collend = p;
3740 /* find all unecodable characters */
3741 while ((collend < endp) && ((*collend)>=limit))
3742 ++collend;
3743 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3744 if (known_errorHandler==-1) {
3745 if ((errors==NULL) || (!strcmp(errors, "strict")))
3746 known_errorHandler = 1;
3747 else if (!strcmp(errors, "replace"))
3748 known_errorHandler = 2;
3749 else if (!strcmp(errors, "ignore"))
3750 known_errorHandler = 3;
3751 else if (!strcmp(errors, "xmlcharrefreplace"))
3752 known_errorHandler = 4;
3753 else
3754 known_errorHandler = 0;
3755 }
3756 switch (known_errorHandler) {
3757 case 1: /* strict */
3758 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3759 goto onError;
3760 case 2: /* replace */
3761 while (collstart++<collend)
3762 *str++ = '?'; /* fall through */
3763 case 3: /* ignore */
3764 p = collend;
3765 break;
3766 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003767 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 /* determine replacement size (temporarily (mis)uses p) */
3769 for (p = collstart, repsize = 0; p < collend; ++p) {
3770 if (*p<10)
3771 repsize += 2+1+1;
3772 else if (*p<100)
3773 repsize += 2+2+1;
3774 else if (*p<1000)
3775 repsize += 2+3+1;
3776 else if (*p<10000)
3777 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003778#ifndef Py_UNICODE_WIDE
3779 else
3780 repsize += 2+5+1;
3781#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 else if (*p<100000)
3783 repsize += 2+5+1;
3784 else if (*p<1000000)
3785 repsize += 2+6+1;
3786 else
3787 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003788#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 }
3790 requiredsize = respos+repsize+(endp-collend);
3791 if (requiredsize > ressize) {
3792 if (requiredsize<2*ressize)
3793 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003794 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003796 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 ressize = requiredsize;
3798 }
3799 /* generate replacement (temporarily (mis)uses p) */
3800 for (p = collstart; p < collend; ++p) {
3801 str += sprintf(str, "&#%d;", (int)*p);
3802 }
3803 p = collend;
3804 break;
3805 default:
3806 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3807 encoding, reason, startp, size, &exc,
3808 collstart-startp, collend-startp, &newpos);
3809 if (repunicode == NULL)
3810 goto onError;
3811 /* need more space? (at least enough for what we
3812 have+the replacement+the rest of the string, so
3813 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003814 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 repsize = PyUnicode_GET_SIZE(repunicode);
3816 requiredsize = respos+repsize+(endp-collend);
3817 if (requiredsize > ressize) {
3818 if (requiredsize<2*ressize)
3819 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003820 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_DECREF(repunicode);
3822 goto onError;
3823 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003824 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 ressize = requiredsize;
3826 }
3827 /* check if there is anything unencodable in the replacement
3828 and copy it to the output */
3829 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3830 c = *uni2;
3831 if (c >= limit) {
3832 raise_encode_exception(&exc, encoding, startp, size,
3833 unicodepos, unicodepos+1, reason);
3834 Py_DECREF(repunicode);
3835 goto onError;
3836 }
3837 *str = (char)c;
3838 }
3839 p = startp + newpos;
3840 Py_DECREF(repunicode);
3841 }
3842 }
3843 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003844 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003845 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003846 onError:
3847 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 Py_XDECREF(errorHandler);
3849 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003850 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851}
3852
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 const char *errors)
3856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858}
3859
3860PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3861{
3862 if (!PyUnicode_Check(unicode)) {
3863 PyErr_BadArgument();
3864 return NULL;
3865 }
3866 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3867 PyUnicode_GET_SIZE(unicode),
3868 NULL);
3869}
3870
3871/* --- 7-bit ASCII Codec -------------------------------------------------- */
3872
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875 const char *errors)
3876{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 PyUnicodeObject *v;
3879 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003880 Py_ssize_t startinpos;
3881 Py_ssize_t endinpos;
3882 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 const char *e;
3884 PyObject *errorHandler = NULL;
3885 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003888 if (size == 1 && *(unsigned char*)s < 128) {
3889 Py_UNICODE r = *(unsigned char*)s;
3890 return PyUnicode_FromUnicode(&r, 1);
3891 }
Tim Petersced69f82003-09-16 20:30:58 +00003892
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 v = _PyUnicode_New(size);
3894 if (v == NULL)
3895 goto onError;
3896 if (size == 0)
3897 return (PyObject *)v;
3898 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 e = s + size;
3900 while (s < e) {
3901 register unsigned char c = (unsigned char)*s;
3902 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 ++s;
3905 }
3906 else {
3907 startinpos = s-starts;
3908 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003909 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003918 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003919 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003920 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 Py_XDECREF(errorHandler);
3922 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003924
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 onError:
3926 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 Py_XDECREF(errorHandler);
3928 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 return NULL;
3930}
3931
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003933 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 const char *errors)
3935{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937}
3938
3939PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3940{
3941 if (!PyUnicode_Check(unicode)) {
3942 PyErr_BadArgument();
3943 return NULL;
3944 }
3945 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3946 PyUnicode_GET_SIZE(unicode),
3947 NULL);
3948}
3949
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003950#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003951
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003952/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003953
/* decode_mbcs() and encode_mbcs() take their sizes as `int`.  When
   Py_ssize_t is wider than int, the public entry points have to feed
   them oversized inputs in chunks of at most INT_MAX. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif
3957
3958/* XXX This code is limited to "true" double-byte encodings, as
3959 a) it assumes an incomplete character consists of a single byte, and
3960 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3961 encodings, see IsDBCSLeadByteEx documentation. */
3962
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  In addition, the
   character found by CharPrev() in front of it must either not exist
   (CharPrev returned the same position), not begin with a lead byte, or
   be exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (target - before == 2);
}
3973
3974/*
3975 * Decode MBCS string into unicode object. If 'final' is set, converts
3976 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3977 */
3978static int decode_mbcs(PyUnicodeObject **v,
3979 const char *s, /* MBCS string */
3980 int size, /* sizeof MBCS string */
3981 int final)
3982{
3983 Py_UNICODE *p;
3984 Py_ssize_t n = 0;
3985 int usize = 0;
3986
3987 assert(size >= 0);
3988
3989 /* Skip trailing lead-byte unless 'final' is set */
3990 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3991 --size;
3992
3993 /* First get the size of the result */
3994 if (size > 0) {
3995 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3996 if (usize == 0) {
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998 return -1;
3999 }
4000 }
4001
4002 if (*v == NULL) {
4003 /* Create unicode object */
4004 *v = _PyUnicode_New(usize);
4005 if (*v == NULL)
4006 return -1;
4007 }
4008 else {
4009 /* Extend unicode object */
4010 n = PyUnicode_GET_SIZE(*v);
4011 if (_PyUnicode_Resize(v, n + usize) < 0)
4012 return -1;
4013 }
4014
4015 /* Do the conversion */
4016 if (size > 0) {
4017 p = PyUnicode_AS_UNICODE(*v) + n;
4018 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4019 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4020 return -1;
4021 }
4022 }
4023
4024 return size;
4025}
4026
4027PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4028 Py_ssize_t size,
4029 const char *errors,
4030 Py_ssize_t *consumed)
4031{
4032 PyUnicodeObject *v = NULL;
4033 int done;
4034
4035 if (consumed)
4036 *consumed = 0;
4037
4038#ifdef NEED_RETRY
4039 retry:
4040 if (size > INT_MAX)
4041 done = decode_mbcs(&v, s, INT_MAX, 0);
4042 else
4043#endif
4044 done = decode_mbcs(&v, s, (int)size, !consumed);
4045
4046 if (done < 0) {
4047 Py_XDECREF(v);
4048 return NULL;
4049 }
4050
4051 if (consumed)
4052 *consumed += done;
4053
4054#ifdef NEED_RETRY
4055 if (size > INT_MAX) {
4056 s += done;
4057 size -= done;
4058 goto retry;
4059 }
4060#endif
4061
4062 return (PyObject *)v;
4063}
4064
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004065PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004067 const char *errors)
4068{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004069 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4070}
4071
4072/*
4073 * Convert unicode into string object (MBCS).
4074 * Returns 0 if succeed, -1 otherwise.
4075 */
4076static int encode_mbcs(PyObject **repr,
4077 const Py_UNICODE *p, /* unicode */
4078 int size) /* size of unicode */
4079{
4080 int mbcssize = 0;
4081 Py_ssize_t n = 0;
4082
4083 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004084
4085 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004086 if (size > 0) {
4087 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4088 if (mbcssize == 0) {
4089 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4090 return -1;
4091 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004092 }
4093
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004094 if (*repr == NULL) {
4095 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004096 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004097 if (*repr == NULL)
4098 return -1;
4099 }
4100 else {
4101 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004102 n = PyBytes_Size(*repr);
4103 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004104 return -1;
4105 }
4106
4107 /* Do the conversion */
4108 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004109 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004110 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4111 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4112 return -1;
4113 }
4114 }
4115
4116 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004117}
4118
4119PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004121 const char *errors)
4122{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004123 PyObject *repr = NULL;
4124 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004125
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004126#ifdef NEED_RETRY
4127 retry:
4128 if (size > INT_MAX)
4129 ret = encode_mbcs(&repr, p, INT_MAX);
4130 else
4131#endif
4132 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004133
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004134 if (ret < 0) {
4135 Py_XDECREF(repr);
4136 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004137 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004138
4139#ifdef NEED_RETRY
4140 if (size > INT_MAX) {
4141 p += INT_MAX;
4142 size -= INT_MAX;
4143 goto retry;
4144 }
4145#endif
4146
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004147 return repr;
4148}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004149
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004150PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4151{
4152 if (!PyUnicode_Check(unicode)) {
4153 PyErr_BadArgument();
4154 return NULL;
4155 }
4156 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4157 PyUnicode_GET_SIZE(unicode),
4158 NULL);
4159}
4160
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004161#undef NEED_RETRY
4162
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004163#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004164
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165/* --- Character Mapping Codec -------------------------------------------- */
4166
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004168 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 PyObject *mapping,
4170 const char *errors)
4171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004173 Py_ssize_t startinpos;
4174 Py_ssize_t endinpos;
4175 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 PyUnicodeObject *v;
4178 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004179 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 PyObject *errorHandler = NULL;
4181 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004182 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004183 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004184
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 /* Default to Latin-1 */
4186 if (mapping == NULL)
4187 return PyUnicode_DecodeLatin1(s, size, errors);
4188
4189 v = _PyUnicode_New(size);
4190 if (v == NULL)
4191 goto onError;
4192 if (size == 0)
4193 return (PyObject *)v;
4194 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004196 if (PyUnicode_CheckExact(mapping)) {
4197 mapstring = PyUnicode_AS_UNICODE(mapping);
4198 maplen = PyUnicode_GET_SIZE(mapping);
4199 while (s < e) {
4200 unsigned char ch = *s;
4201 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004203 if (ch < maplen)
4204 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004206 if (x == 0xfffe) {
4207 /* undefined mapping */
4208 outpos = p-PyUnicode_AS_UNICODE(v);
4209 startinpos = s-starts;
4210 endinpos = startinpos+1;
4211 if (unicode_decode_call_errorhandler(
4212 errors, &errorHandler,
4213 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004214 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004215 (PyObject **)&v, &outpos, &p)) {
4216 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004217 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004218 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004219 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004220 *p++ = x;
4221 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004223 }
4224 else {
4225 while (s < e) {
4226 unsigned char ch = *s;
4227 PyObject *w, *x;
4228
4229 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004230 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004231 if (w == NULL)
4232 goto onError;
4233 x = PyObject_GetItem(mapping, w);
4234 Py_DECREF(w);
4235 if (x == NULL) {
4236 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4237 /* No mapping found means: mapping is undefined. */
4238 PyErr_Clear();
4239 x = Py_None;
4240 Py_INCREF(x);
4241 } else
4242 goto onError;
4243 }
4244
4245 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004246 if (PyLong_Check(x)) {
4247 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004248 if (value < 0 || value > 65535) {
4249 PyErr_SetString(PyExc_TypeError,
4250 "character mapping must be in range(65536)");
4251 Py_DECREF(x);
4252 goto onError;
4253 }
4254 *p++ = (Py_UNICODE)value;
4255 }
4256 else if (x == Py_None) {
4257 /* undefined mapping */
4258 outpos = p-PyUnicode_AS_UNICODE(v);
4259 startinpos = s-starts;
4260 endinpos = startinpos+1;
4261 if (unicode_decode_call_errorhandler(
4262 errors, &errorHandler,
4263 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004264 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004265 (PyObject **)&v, &outpos, &p)) {
4266 Py_DECREF(x);
4267 goto onError;
4268 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004269 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004270 continue;
4271 }
4272 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004274
4275 if (targetsize == 1)
4276 /* 1-1 mapping */
4277 *p++ = *PyUnicode_AS_UNICODE(x);
4278
4279 else if (targetsize > 1) {
4280 /* 1-n mapping */
4281 if (targetsize > extrachars) {
4282 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4284 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004285 (targetsize << 2);
4286 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004287 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004288 if (_PyUnicode_Resize(&v,
4289 PyUnicode_GET_SIZE(v) + needed) < 0) {
4290 Py_DECREF(x);
4291 goto onError;
4292 }
4293 p = PyUnicode_AS_UNICODE(v) + oldpos;
4294 }
4295 Py_UNICODE_COPY(p,
4296 PyUnicode_AS_UNICODE(x),
4297 targetsize);
4298 p += targetsize;
4299 extrachars -= targetsize;
4300 }
4301 /* 1-0 mapping: skip the character */
4302 }
4303 else {
4304 /* wrong return value */
4305 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004306 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004307 Py_DECREF(x);
4308 goto onError;
4309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004311 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 }
4314 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004315 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 Py_XDECREF(errorHandler);
4318 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004320
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 Py_XDECREF(errorHandler);
4323 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 Py_XDECREF(v);
4325 return NULL;
4326}
4327
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004328/* Charmap encoding: the lookup table */
4329
4330struct encoding_map{
4331 PyObject_HEAD
4332 unsigned char level1[32];
4333 int count2, count3;
4334 unsigned char level23[1];
4335};
4336
4337static PyObject*
4338encoding_map_size(PyObject *obj, PyObject* args)
4339{
4340 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004341 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004342 128*map->count3);
4343}
4344
4345static PyMethodDef encoding_map_methods[] = {
4346 {"size", encoding_map_size, METH_NOARGS,
4347 PyDoc_STR("Return the size (in bytes) of this object") },
4348 { 0 }
4349};
4350
4351static void
4352encoding_map_dealloc(PyObject* o)
4353{
4354 PyObject_FREE(o);
4355}
4356
4357static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004358 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004359 "EncodingMap", /*tp_name*/
4360 sizeof(struct encoding_map), /*tp_basicsize*/
4361 0, /*tp_itemsize*/
4362 /* methods */
4363 encoding_map_dealloc, /*tp_dealloc*/
4364 0, /*tp_print*/
4365 0, /*tp_getattr*/
4366 0, /*tp_setattr*/
4367 0, /*tp_compare*/
4368 0, /*tp_repr*/
4369 0, /*tp_as_number*/
4370 0, /*tp_as_sequence*/
4371 0, /*tp_as_mapping*/
4372 0, /*tp_hash*/
4373 0, /*tp_call*/
4374 0, /*tp_str*/
4375 0, /*tp_getattro*/
4376 0, /*tp_setattro*/
4377 0, /*tp_as_buffer*/
4378 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4379 0, /*tp_doc*/
4380 0, /*tp_traverse*/
4381 0, /*tp_clear*/
4382 0, /*tp_richcompare*/
4383 0, /*tp_weaklistoffset*/
4384 0, /*tp_iter*/
4385 0, /*tp_iternext*/
4386 encoding_map_methods, /*tp_methods*/
4387 0, /*tp_members*/
4388 0, /*tp_getset*/
4389 0, /*tp_base*/
4390 0, /*tp_dict*/
4391 0, /*tp_descr_get*/
4392 0, /*tp_descr_set*/
4393 0, /*tp_dictoffset*/
4394 0, /*tp_init*/
4395 0, /*tp_alloc*/
4396 0, /*tp_new*/
4397 0, /*tp_free*/
4398 0, /*tp_is_gc*/
4399};
4400
4401PyObject*
4402PyUnicode_BuildEncodingMap(PyObject* string)
4403{
4404 Py_UNICODE *decode;
4405 PyObject *result;
4406 struct encoding_map *mresult;
4407 int i;
4408 int need_dict = 0;
4409 unsigned char level1[32];
4410 unsigned char level2[512];
4411 unsigned char *mlevel1, *mlevel2, *mlevel3;
4412 int count2 = 0, count3 = 0;
4413
4414 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4415 PyErr_BadArgument();
4416 return NULL;
4417 }
4418 decode = PyUnicode_AS_UNICODE(string);
4419 memset(level1, 0xFF, sizeof level1);
4420 memset(level2, 0xFF, sizeof level2);
4421
4422 /* If there isn't a one-to-one mapping of NULL to \0,
4423 or if there are non-BMP characters, we need to use
4424 a mapping dictionary. */
4425 if (decode[0] != 0)
4426 need_dict = 1;
4427 for (i = 1; i < 256; i++) {
4428 int l1, l2;
4429 if (decode[i] == 0
4430 #ifdef Py_UNICODE_WIDE
4431 || decode[i] > 0xFFFF
4432 #endif
4433 ) {
4434 need_dict = 1;
4435 break;
4436 }
4437 if (decode[i] == 0xFFFE)
4438 /* unmapped character */
4439 continue;
4440 l1 = decode[i] >> 11;
4441 l2 = decode[i] >> 7;
4442 if (level1[l1] == 0xFF)
4443 level1[l1] = count2++;
4444 if (level2[l2] == 0xFF)
4445 level2[l2] = count3++;
4446 }
4447
4448 if (count2 >= 0xFF || count3 >= 0xFF)
4449 need_dict = 1;
4450
4451 if (need_dict) {
4452 PyObject *result = PyDict_New();
4453 PyObject *key, *value;
4454 if (!result)
4455 return NULL;
4456 for (i = 0; i < 256; i++) {
4457 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004458 key = PyLong_FromLong(decode[i]);
4459 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004460 if (!key || !value)
4461 goto failed1;
4462 if (PyDict_SetItem(result, key, value) == -1)
4463 goto failed1;
4464 Py_DECREF(key);
4465 Py_DECREF(value);
4466 }
4467 return result;
4468 failed1:
4469 Py_XDECREF(key);
4470 Py_XDECREF(value);
4471 Py_DECREF(result);
4472 return NULL;
4473 }
4474
4475 /* Create a three-level trie */
4476 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4477 16*count2 + 128*count3 - 1);
4478 if (!result)
4479 return PyErr_NoMemory();
4480 PyObject_Init(result, &EncodingMapType);
4481 mresult = (struct encoding_map*)result;
4482 mresult->count2 = count2;
4483 mresult->count3 = count3;
4484 mlevel1 = mresult->level1;
4485 mlevel2 = mresult->level23;
4486 mlevel3 = mresult->level23 + 16*count2;
4487 memcpy(mlevel1, level1, 32);
4488 memset(mlevel2, 0xFF, 16*count2);
4489 memset(mlevel3, 0, 128*count3);
4490 count3 = 0;
4491 for (i = 1; i < 256; i++) {
4492 int o1, o2, o3, i2, i3;
4493 if (decode[i] == 0xFFFE)
4494 /* unmapped character */
4495 continue;
4496 o1 = decode[i]>>11;
4497 o2 = (decode[i]>>7) & 0xF;
4498 i2 = 16*mlevel1[o1] + o2;
4499 if (mlevel2[i2] == 0xFF)
4500 mlevel2[i2] = count3++;
4501 o3 = decode[i] & 0x7F;
4502 i3 = 128*mlevel2[i2] + o3;
4503 mlevel3[i3] = i;
4504 }
4505 return result;
4506}
4507
4508static int
4509encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4510{
4511 struct encoding_map *map = (struct encoding_map*)mapping;
4512 int l1 = c>>11;
4513 int l2 = (c>>7) & 0xF;
4514 int l3 = c & 0x7F;
4515 int i;
4516
4517#ifdef Py_UNICODE_WIDE
4518 if (c > 0xFFFF) {
4519 return -1;
4520 }
4521#endif
4522 if (c == 0)
4523 return 0;
4524 /* level 1*/
4525 i = map->level1[l1];
4526 if (i == 0xFF) {
4527 return -1;
4528 }
4529 /* level 2*/
4530 i = map->level23[16*i+l2];
4531 if (i == 0xFF) {
4532 return -1;
4533 }
4534 /* level 3 */
4535 i = map->level23[16*map->count2 + 128*i + l3];
4536 if (i == 0) {
4537 return -1;
4538 }
4539 return i;
4540}
4541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542/* Lookup the character ch in the mapping. If the character
4543 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004544 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546{
Christian Heimes217cfd12007-12-02 14:31:20 +00004547 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *x;
4549
4550 if (w == NULL)
4551 return NULL;
4552 x = PyObject_GetItem(mapping, w);
4553 Py_DECREF(w);
4554 if (x == NULL) {
4555 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4556 /* No mapping found means: mapping is undefined. */
4557 PyErr_Clear();
4558 x = Py_None;
4559 Py_INCREF(x);
4560 return x;
4561 } else
4562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004564 else if (x == Py_None)
4565 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004566 else if (PyLong_Check(x)) {
4567 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 if (value < 0 || value > 255) {
4569 PyErr_SetString(PyExc_TypeError,
4570 "character mapping must be in range(256)");
4571 Py_DECREF(x);
4572 return NULL;
4573 }
4574 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004576 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004580 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004581 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004582 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_DECREF(x);
4584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 }
4586}
4587
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004588static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004589charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004590{
Christian Heimes72b710a2008-05-26 13:28:38 +00004591 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004592 /* exponentially overallocate to minimize reallocations */
4593 if (requiredsize < 2*outsize)
4594 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004595 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004596 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004597 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004598}
4599
/* Outcome of encoding one character through a character mapping. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004604 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 space is available. Return a new reference to the object that
4606 was put in the output buffer, or Py_None, if the mapping was undefined
4607 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004608 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004610charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004611 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004613 PyObject *rep;
4614 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004615 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616
Christian Heimes90aa7642007-12-19 02:45:37 +00004617 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004618 int res = encoding_map_lookup(c, mapping);
4619 Py_ssize_t requiredsize = *outpos+1;
4620 if (res == -1)
4621 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004623 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004624 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004625 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004626 outstart[(*outpos)++] = (char)res;
4627 return enc_SUCCESS;
4628 }
4629
4630 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004632 return enc_EXCEPTION;
4633 else if (rep==Py_None) {
4634 Py_DECREF(rep);
4635 return enc_FAILED;
4636 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004637 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004639 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004640 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004642 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004644 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004645 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 }
4647 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004648 const char *repchars = PyBytes_AS_STRING(rep);
4649 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004651 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004652 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004654 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004656 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 memcpy(outstart + *outpos, repchars, repsize);
4658 *outpos += repsize;
4659 }
4660 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004661 Py_DECREF(rep);
4662 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663}
4664
4665/* handle an error in PyUnicode_EncodeCharmap
4666 Return 0 on success, -1 on error */
4667static
4668int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004671 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004672 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673{
4674 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004675 Py_ssize_t repsize;
4676 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 Py_UNICODE *uni2;
4678 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t collstartpos = *inpos;
4680 Py_ssize_t collendpos = *inpos+1;
4681 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 char *encoding = "charmap";
4683 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004684 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 /* find all unencodable characters */
4687 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004688 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004689 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004690 int res = encoding_map_lookup(p[collendpos], mapping);
4691 if (res != -1)
4692 break;
4693 ++collendpos;
4694 continue;
4695 }
4696
4697 rep = charmapencode_lookup(p[collendpos], mapping);
4698 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004700 else if (rep!=Py_None) {
4701 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 break;
4703 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004704 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 ++collendpos;
4706 }
4707 /* cache callback name lookup
4708 * (if not done yet, i.e. it's the first error) */
4709 if (*known_errorHandler==-1) {
4710 if ((errors==NULL) || (!strcmp(errors, "strict")))
4711 *known_errorHandler = 1;
4712 else if (!strcmp(errors, "replace"))
4713 *known_errorHandler = 2;
4714 else if (!strcmp(errors, "ignore"))
4715 *known_errorHandler = 3;
4716 else if (!strcmp(errors, "xmlcharrefreplace"))
4717 *known_errorHandler = 4;
4718 else
4719 *known_errorHandler = 0;
4720 }
4721 switch (*known_errorHandler) {
4722 case 1: /* strict */
4723 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4724 return -1;
4725 case 2: /* replace */
4726 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4727 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004728 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 return -1;
4730 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004731 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4733 return -1;
4734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 }
4736 /* fall through */
4737 case 3: /* ignore */
4738 *inpos = collendpos;
4739 break;
4740 case 4: /* xmlcharrefreplace */
4741 /* generate replacement (temporarily (mis)uses p) */
4742 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4743 char buffer[2+29+1+1];
4744 char *cp;
4745 sprintf(buffer, "&#%d;", (int)p[collpos]);
4746 for (cp = buffer; *cp; ++cp) {
4747 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004748 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004750 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4752 return -1;
4753 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 }
4755 }
4756 *inpos = collendpos;
4757 break;
4758 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004759 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 encoding, reason, p, size, exceptionObject,
4761 collstartpos, collendpos, &newpos);
4762 if (repunicode == NULL)
4763 return -1;
4764 /* generate replacement */
4765 repsize = PyUnicode_GET_SIZE(repunicode);
4766 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4767 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004768 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 return -1;
4770 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004771 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4774 return -1;
4775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 }
4777 *inpos = newpos;
4778 Py_DECREF(repunicode);
4779 }
4780 return 0;
4781}
4782
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 PyObject *mapping,
4786 const char *errors)
4787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 /* output object */
4789 PyObject *res = NULL;
4790 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004793 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 PyObject *errorHandler = NULL;
4795 PyObject *exc = NULL;
4796 /* the following variable is used for caching string comparisons
4797 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4798 * 3=ignore, 4=xmlcharrefreplace */
4799 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
4801 /* Default to Latin-1 */
4802 if (mapping == NULL)
4803 return PyUnicode_EncodeLatin1(p, size, errors);
4804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 /* allocate enough for a simple encoding without
4806 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004807 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 if (res == NULL)
4809 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004810 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 while (inpos<size) {
4814 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004815 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004816 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004818 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 if (charmap_encoding_error(p, size, &inpos, mapping,
4820 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004821 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004822 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004823 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 else
4827 /* done with this character => adjust input position */
4828 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004832 if (respos<PyBytes_GET_SIZE(res))
4833 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 Py_XDECREF(exc);
4836 Py_XDECREF(errorHandler);
4837 return res;
4838
4839 onError:
4840 Py_XDECREF(res);
4841 Py_XDECREF(exc);
4842 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 return NULL;
4844}
4845
4846PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4847 PyObject *mapping)
4848{
4849 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4850 PyErr_BadArgument();
4851 return NULL;
4852 }
4853 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4854 PyUnicode_GET_SIZE(unicode),
4855 mapping,
4856 NULL);
4857}
4858
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859/* create or adjust a UnicodeTranslateError */
4860static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004861 const Py_UNICODE *unicode, Py_ssize_t size,
4862 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 if (*exceptionObject == NULL) {
4866 *exceptionObject = PyUnicodeTranslateError_Create(
4867 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
4869 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4871 goto onError;
4872 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4873 goto onError;
4874 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4875 goto onError;
4876 return;
4877 onError:
4878 Py_DECREF(*exceptionObject);
4879 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 }
4881}
4882
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883/* raises a UnicodeTranslateError */
4884static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004885 const Py_UNICODE *unicode, Py_ssize_t size,
4886 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 const char *reason)
4888{
4889 make_translate_exception(exceptionObject,
4890 unicode, size, startpos, endpos, reason);
4891 if (*exceptionObject != NULL)
4892 PyCodec_StrictErrors(*exceptionObject);
4893}
4894
4895/* error handling callback helper:
4896 build arguments, call the callback and check the arguments,
4897 put the result into newpos and return the replacement string, which
4898 has to be freed by the caller */
4899static PyObject *unicode_translate_call_errorhandler(const char *errors,
4900 PyObject **errorHandler,
4901 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4903 Py_ssize_t startpos, Py_ssize_t endpos,
4904 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004906 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004908 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 PyObject *restuple;
4910 PyObject *resunicode;
4911
4912 if (*errorHandler == NULL) {
4913 *errorHandler = PyCodec_LookupError(errors);
4914 if (*errorHandler == NULL)
4915 return NULL;
4916 }
4917
4918 make_translate_exception(exceptionObject,
4919 unicode, size, startpos, endpos, reason);
4920 if (*exceptionObject == NULL)
4921 return NULL;
4922
4923 restuple = PyObject_CallFunctionObjArgs(
4924 *errorHandler, *exceptionObject, NULL);
4925 if (restuple == NULL)
4926 return NULL;
4927 if (!PyTuple_Check(restuple)) {
4928 PyErr_Format(PyExc_TypeError, &argparse[4]);
4929 Py_DECREF(restuple);
4930 return NULL;
4931 }
4932 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 Py_DECREF(restuple);
4935 return NULL;
4936 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004937 if (i_newpos<0)
4938 *newpos = size+i_newpos;
4939 else
4940 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004941 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004942 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004943 Py_DECREF(restuple);
4944 return NULL;
4945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 Py_INCREF(resunicode);
4947 Py_DECREF(restuple);
4948 return resunicode;
4949}
4950
4951/* Lookup the character ch in the mapping and put the result in result,
4952 which must be decrefed by the caller.
4953 Return 0 on success, -1 on error */
4954static
4955int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4956{
Christian Heimes217cfd12007-12-02 14:31:20 +00004957 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 PyObject *x;
4959
4960 if (w == NULL)
4961 return -1;
4962 x = PyObject_GetItem(mapping, w);
4963 Py_DECREF(w);
4964 if (x == NULL) {
4965 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4966 /* No mapping found means: use 1:1 mapping. */
4967 PyErr_Clear();
4968 *result = NULL;
4969 return 0;
4970 } else
4971 return -1;
4972 }
4973 else if (x == Py_None) {
4974 *result = x;
4975 return 0;
4976 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004977 else if (PyLong_Check(x)) {
4978 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 long max = PyUnicode_GetMax();
4980 if (value < 0 || value > max) {
4981 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004982 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 Py_DECREF(x);
4984 return -1;
4985 }
4986 *result = x;
4987 return 0;
4988 }
4989 else if (PyUnicode_Check(x)) {
4990 *result = x;
4991 return 0;
4992 }
4993 else {
4994 /* wrong return value */
4995 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004996 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00004997 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 return -1;
4999 }
5000}
5001/* ensure that *outobj is at least requiredsize characters long,
5002if not reallocate and adjust various state variables.
5003Return 0 on success, -1 on error */
5004static
Walter Dörwald4894c302003-10-24 14:25:28 +00005005int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005008 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005009 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005011 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005013 if (requiredsize < 2 * oldsize)
5014 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005015 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 return -1;
5017 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 }
5019 return 0;
5020}
5021/* lookup the character, put the result in the output string and adjust
5022 various state variables. Return a new reference to the object that
5023 was put in the output buffer in *result, or Py_None, if the mapping was
5024 undefined (in which case no character was written).
5025 The called must decref result.
5026 Return 0 on success, -1 on error. */
5027static
Walter Dörwald4894c302003-10-24 14:25:28 +00005028int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005029 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005030 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031{
Walter Dörwald4894c302003-10-24 14:25:28 +00005032 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 return -1;
5034 if (*res==NULL) {
5035 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005036 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 }
5038 else if (*res==Py_None)
5039 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005040 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005042 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 }
5044 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005045 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 if (repsize==1) {
5047 /* no overflow check, because we know that the space is enough */
5048 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5049 }
5050 else if (repsize!=0) {
5051 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005052 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005053 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005054 repsize - 1;
5055 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 return -1;
5057 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5058 *outp += repsize;
5059 }
5060 }
5061 else
5062 return -1;
5063 return 0;
5064}
5065
5066PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005067 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 PyObject *mapping,
5069 const char *errors)
5070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 /* output object */
5072 PyObject *res = NULL;
5073 /* pointers to the beginning and end+1 of input */
5074 const Py_UNICODE *startp = p;
5075 const Py_UNICODE *endp = p + size;
5076 /* pointer into the output */
5077 Py_UNICODE *str;
5078 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005079 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 char *reason = "character maps to <undefined>";
5081 PyObject *errorHandler = NULL;
5082 PyObject *exc = NULL;
5083 /* the following variable is used for caching string comparisons
5084 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5085 * 3=ignore, 4=xmlcharrefreplace */
5086 int known_errorHandler = -1;
5087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 if (mapping == NULL) {
5089 PyErr_BadArgument();
5090 return NULL;
5091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092
5093 /* allocate enough for a simple 1:1 translation without
5094 replacements, if we need more, we'll resize */
5095 res = PyUnicode_FromUnicode(NULL, size);
5096 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 return res;
5100 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 while (p<endp) {
5103 /* try to encode it */
5104 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005105 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 goto onError;
5108 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005109 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 if (x!=Py_None) /* it worked => adjust input pointer */
5111 ++p;
5112 else { /* untranslatable character */
5113 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005114 Py_ssize_t repsize;
5115 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 Py_UNICODE *uni2;
5117 /* startpos for collecting untranslatable chars */
5118 const Py_UNICODE *collstart = p;
5119 const Py_UNICODE *collend = p+1;
5120 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 /* find all untranslatable characters */
5123 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005124 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 goto onError;
5126 Py_XDECREF(x);
5127 if (x!=Py_None)
5128 break;
5129 ++collend;
5130 }
5131 /* cache callback name lookup
5132 * (if not done yet, i.e. it's the first error) */
5133 if (known_errorHandler==-1) {
5134 if ((errors==NULL) || (!strcmp(errors, "strict")))
5135 known_errorHandler = 1;
5136 else if (!strcmp(errors, "replace"))
5137 known_errorHandler = 2;
5138 else if (!strcmp(errors, "ignore"))
5139 known_errorHandler = 3;
5140 else if (!strcmp(errors, "xmlcharrefreplace"))
5141 known_errorHandler = 4;
5142 else
5143 known_errorHandler = 0;
5144 }
5145 switch (known_errorHandler) {
5146 case 1: /* strict */
5147 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5148 goto onError;
5149 case 2: /* replace */
5150 /* No need to check for space, this is a 1:1 replacement */
5151 for (coll = collstart; coll<collend; ++coll)
5152 *str++ = '?';
5153 /* fall through */
5154 case 3: /* ignore */
5155 p = collend;
5156 break;
5157 case 4: /* xmlcharrefreplace */
5158 /* generate replacement (temporarily (mis)uses p) */
5159 for (p = collstart; p < collend; ++p) {
5160 char buffer[2+29+1+1];
5161 char *cp;
5162 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005163 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5165 goto onError;
5166 for (cp = buffer; *cp; ++cp)
5167 *str++ = *cp;
5168 }
5169 p = collend;
5170 break;
5171 default:
5172 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5173 reason, startp, size, &exc,
5174 collstart-startp, collend-startp, &newpos);
5175 if (repunicode == NULL)
5176 goto onError;
5177 /* generate replacement */
5178 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005179 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5181 Py_DECREF(repunicode);
5182 goto onError;
5183 }
5184 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5185 *str++ = *uni2;
5186 p = startp + newpos;
5187 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 }
5189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191 /* Resize if we allocated to much */
5192 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005193 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005194 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005195 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 }
5197 Py_XDECREF(exc);
5198 Py_XDECREF(errorHandler);
5199 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 onError:
5202 Py_XDECREF(res);
5203 Py_XDECREF(exc);
5204 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 return NULL;
5206}
5207
5208PyObject *PyUnicode_Translate(PyObject *str,
5209 PyObject *mapping,
5210 const char *errors)
5211{
5212 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 str = PyUnicode_FromObject(str);
5215 if (str == NULL)
5216 goto onError;
5217 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5218 PyUnicode_GET_SIZE(str),
5219 mapping,
5220 errors);
5221 Py_DECREF(str);
5222 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005223
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 onError:
5225 Py_XDECREF(str);
5226 return NULL;
5227}
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossum9e896b32000-04-05 20:11:21 +00005229/* --- Decimal Encoder ---------------------------------------------------- */
5230
5231int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233 char *output,
5234 const char *errors)
5235{
5236 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 PyObject *errorHandler = NULL;
5238 PyObject *exc = NULL;
5239 const char *encoding = "decimal";
5240 const char *reason = "invalid decimal Unicode string";
5241 /* the following variable is used for caching string comparisons
5242 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5243 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005244
5245 if (output == NULL) {
5246 PyErr_BadArgument();
5247 return -1;
5248 }
5249
5250 p = s;
5251 end = s + length;
5252 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005254 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005256 Py_ssize_t repsize;
5257 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005258 Py_UNICODE *uni2;
5259 Py_UNICODE *collstart;
5260 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Guido van Rossum9e896b32000-04-05 20:11:21 +00005262 if (Py_UNICODE_ISSPACE(ch)) {
5263 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005265 continue;
5266 }
5267 decimal = Py_UNICODE_TODECIMAL(ch);
5268 if (decimal >= 0) {
5269 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005271 continue;
5272 }
Guido van Rossumba477042000-04-06 18:18:10 +00005273 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005274 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005276 continue;
5277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005278 /* All other characters are considered unencodable */
5279 collstart = p;
5280 collend = p+1;
5281 while (collend < end) {
5282 if ((0 < *collend && *collend < 256) ||
5283 !Py_UNICODE_ISSPACE(*collend) ||
5284 Py_UNICODE_TODECIMAL(*collend))
5285 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 /* cache callback name lookup
5288 * (if not done yet, i.e. it's the first error) */
5289 if (known_errorHandler==-1) {
5290 if ((errors==NULL) || (!strcmp(errors, "strict")))
5291 known_errorHandler = 1;
5292 else if (!strcmp(errors, "replace"))
5293 known_errorHandler = 2;
5294 else if (!strcmp(errors, "ignore"))
5295 known_errorHandler = 3;
5296 else if (!strcmp(errors, "xmlcharrefreplace"))
5297 known_errorHandler = 4;
5298 else
5299 known_errorHandler = 0;
5300 }
5301 switch (known_errorHandler) {
5302 case 1: /* strict */
5303 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5304 goto onError;
5305 case 2: /* replace */
5306 for (p = collstart; p < collend; ++p)
5307 *output++ = '?';
5308 /* fall through */
5309 case 3: /* ignore */
5310 p = collend;
5311 break;
5312 case 4: /* xmlcharrefreplace */
5313 /* generate replacement (temporarily (mis)uses p) */
5314 for (p = collstart; p < collend; ++p)
5315 output += sprintf(output, "&#%d;", (int)*p);
5316 p = collend;
5317 break;
5318 default:
5319 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5320 encoding, reason, s, length, &exc,
5321 collstart-s, collend-s, &newpos);
5322 if (repunicode == NULL)
5323 goto onError;
5324 /* generate replacement */
5325 repsize = PyUnicode_GET_SIZE(repunicode);
5326 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5327 Py_UNICODE ch = *uni2;
5328 if (Py_UNICODE_ISSPACE(ch))
5329 *output++ = ' ';
5330 else {
5331 decimal = Py_UNICODE_TODECIMAL(ch);
5332 if (decimal >= 0)
5333 *output++ = '0' + decimal;
5334 else if (0 < ch && ch < 256)
5335 *output++ = (char)ch;
5336 else {
5337 Py_DECREF(repunicode);
5338 raise_encode_exception(&exc, encoding,
5339 s, length, collstart-s, collend-s, reason);
5340 goto onError;
5341 }
5342 }
5343 }
5344 p = s + newpos;
5345 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005346 }
5347 }
5348 /* 0-terminate the output string */
5349 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 Py_XDECREF(exc);
5351 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005352 return 0;
5353
5354 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 Py_XDECREF(exc);
5356 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005357 return -1;
5358}
5359
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360/* --- Helpers ------------------------------------------------------------ */
5361
Eric Smith8c663262007-08-25 02:26:07 +00005362#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005363#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005364#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005365/* Include _ParseTupleFinds from find.h */
5366#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005367#include "stringlib/find.h"
5368#include "stringlib/partition.h"
5369
Eric Smith5807c412008-05-11 21:00:57 +00005370#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5371#include "stringlib/localeutil.h"
5372
/* Helper macro: normalize the slice bounds held in the caller's local
 * variables `start` and `end` against (obj)->length.
 *
 * A negative `start` is offset by the length and then floored at 0.  `end`
 * is capped at the length; a negative `end` is offset by the length and then
 * floored at 0.  `start` is never capped at the length, so it can still be
 * greater than `end` afterwards.
 *
 * The body is wrapped in do { ... } while (0) so the expansion is exactly one
 * statement and stays intact as the body of an unbraced if/else or loop.
 * Invoke it as a statement, followed by a semicolon.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5385
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005387 PyObject *substr,
5388 Py_ssize_t start,
5389 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005391 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005392 PyUnicodeObject* str_obj;
5393 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005394
Thomas Wouters477c8d52006-05-27 19:21:47 +00005395 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5396 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005398 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5399 if (!sub_obj) {
5400 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 return -1;
5402 }
Tim Petersced69f82003-09-16 20:30:58 +00005403
Thomas Wouters477c8d52006-05-27 19:21:47 +00005404 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005405
Thomas Wouters477c8d52006-05-27 19:21:47 +00005406 result = stringlib_count(
5407 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5408 );
5409
5410 Py_DECREF(sub_obj);
5411 Py_DECREF(str_obj);
5412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 return result;
5414}
5415
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 PyObject *sub,
5418 Py_ssize_t start,
5419 Py_ssize_t end,
5420 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005422 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005425 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005426 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005427 sub = PyUnicode_FromObject(sub);
5428 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005429 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005430 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
Tim Petersced69f82003-09-16 20:30:58 +00005432
Thomas Wouters477c8d52006-05-27 19:21:47 +00005433 if (direction > 0)
5434 result = stringlib_find_slice(
5435 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5436 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5437 start, end
5438 );
5439 else
5440 result = stringlib_rfind_slice(
5441 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5442 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5443 start, end
5444 );
5445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005447 Py_DECREF(sub);
5448
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return result;
5450}
5451
Tim Petersced69f82003-09-16 20:30:58 +00005452static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453int tailmatch(PyUnicodeObject *self,
5454 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455 Py_ssize_t start,
5456 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 int direction)
5458{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 if (substring->length == 0)
5460 return 1;
5461
Thomas Wouters477c8d52006-05-27 19:21:47 +00005462 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
5464 end -= substring->length;
5465 if (end < start)
5466 return 0;
5467
5468 if (direction > 0) {
5469 if (Py_UNICODE_MATCH(self, end, substring))
5470 return 1;
5471 } else {
5472 if (Py_UNICODE_MATCH(self, start, substring))
5473 return 1;
5474 }
5475
5476 return 0;
5477}
5478
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t start,
5482 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 int direction)
5484{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005485 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005486
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 str = PyUnicode_FromObject(str);
5488 if (str == NULL)
5489 return -1;
5490 substr = PyUnicode_FromObject(substr);
5491 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005492 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 return -1;
5494 }
Tim Petersced69f82003-09-16 20:30:58 +00005495
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 result = tailmatch((PyUnicodeObject *)str,
5497 (PyUnicodeObject *)substr,
5498 start, end, direction);
5499 Py_DECREF(str);
5500 Py_DECREF(substr);
5501 return result;
5502}
5503
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504/* Apply fixfct filter to the Unicode object self and return a
5505 reference to the modified object */
5506
Tim Petersced69f82003-09-16 20:30:58 +00005507static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508PyObject *fixup(PyUnicodeObject *self,
5509 int (*fixfct)(PyUnicodeObject *s))
5510{
5511
5512 PyUnicodeObject *u;
5513
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005514 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 if (u == NULL)
5516 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005517
5518 Py_UNICODE_COPY(u->str, self->str, self->length);
5519
Tim Peters7a29bd52001-09-12 03:03:31 +00005520 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 /* fixfct should return TRUE if it modified the buffer. If
5522 FALSE, return a reference to the original buffer instead
5523 (to save space, not time) */
5524 Py_INCREF(self);
5525 Py_DECREF(u);
5526 return (PyObject*) self;
5527 }
5528 return (PyObject*) u;
5529}
5530
Tim Petersced69f82003-09-16 20:30:58 +00005531static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532int fixupper(PyUnicodeObject *self)
5533{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 Py_UNICODE *s = self->str;
5536 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005537
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 while (len-- > 0) {
5539 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 ch = Py_UNICODE_TOUPPER(*s);
5542 if (ch != *s) {
5543 status = 1;
5544 *s = ch;
5545 }
5546 s++;
5547 }
5548
5549 return status;
5550}
5551
Tim Petersced69f82003-09-16 20:30:58 +00005552static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553int fixlower(PyUnicodeObject *self)
5554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 Py_UNICODE *s = self->str;
5557 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 while (len-- > 0) {
5560 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005561
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 ch = Py_UNICODE_TOLOWER(*s);
5563 if (ch != *s) {
5564 status = 1;
5565 *s = ch;
5566 }
5567 s++;
5568 }
5569
5570 return status;
5571}
5572
Tim Petersced69f82003-09-16 20:30:58 +00005573static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574int fixswapcase(PyUnicodeObject *self)
5575{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005576 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 Py_UNICODE *s = self->str;
5578 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005579
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 while (len-- > 0) {
5581 if (Py_UNICODE_ISUPPER(*s)) {
5582 *s = Py_UNICODE_TOLOWER(*s);
5583 status = 1;
5584 } else if (Py_UNICODE_ISLOWER(*s)) {
5585 *s = Py_UNICODE_TOUPPER(*s);
5586 status = 1;
5587 }
5588 s++;
5589 }
5590
5591 return status;
5592}
5593
Tim Petersced69f82003-09-16 20:30:58 +00005594static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595int fixcapitalize(PyUnicodeObject *self)
5596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005597 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005598 Py_UNICODE *s = self->str;
5599 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005600
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005601 if (len == 0)
5602 return 0;
5603 if (Py_UNICODE_ISLOWER(*s)) {
5604 *s = Py_UNICODE_TOUPPER(*s);
5605 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005607 s++;
5608 while (--len > 0) {
5609 if (Py_UNICODE_ISUPPER(*s)) {
5610 *s = Py_UNICODE_TOLOWER(*s);
5611 status = 1;
5612 }
5613 s++;
5614 }
5615 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616}
5617
5618static
5619int fixtitle(PyUnicodeObject *self)
5620{
5621 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5622 register Py_UNICODE *e;
5623 int previous_is_cased;
5624
5625 /* Shortcut for single character strings */
5626 if (PyUnicode_GET_SIZE(self) == 1) {
5627 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5628 if (*p != ch) {
5629 *p = ch;
5630 return 1;
5631 }
5632 else
5633 return 0;
5634 }
Tim Petersced69f82003-09-16 20:30:58 +00005635
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 e = p + PyUnicode_GET_SIZE(self);
5637 previous_is_cased = 0;
5638 for (; p < e; p++) {
5639 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (previous_is_cased)
5642 *p = Py_UNICODE_TOLOWER(ch);
5643 else
5644 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005645
5646 if (Py_UNICODE_ISLOWER(ch) ||
5647 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 Py_UNICODE_ISTITLE(ch))
5649 previous_is_cased = 1;
5650 else
5651 previous_is_cased = 0;
5652 }
5653 return 1;
5654}
5655
Tim Peters8ce9f162004-08-27 01:49:32 +00005656PyObject *
5657PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658{
Skip Montanaro6543b452004-09-16 03:28:13 +00005659 const Py_UNICODE blank = ' ';
5660 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005661 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005662 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5664 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005665 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5666 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005667 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005668 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
Tim Peters05eba1f2004-08-27 21:32:02 +00005670 fseq = PySequence_Fast(seq, "");
5671 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005672 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005673 }
5674
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005675 /* NOTE: the following code can't call back into Python code,
5676 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005677 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005678
Tim Peters05eba1f2004-08-27 21:32:02 +00005679 seqlen = PySequence_Fast_GET_SIZE(fseq);
5680 /* If empty sequence, return u"". */
5681 if (seqlen == 0) {
5682 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5683 goto Done;
5684 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005685 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005686 /* If singleton sequence with an exact Unicode, return that. */
5687 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005688 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005689 if (PyUnicode_CheckExact(item)) {
5690 Py_INCREF(item);
5691 res = (PyUnicodeObject *)item;
5692 goto Done;
5693 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005694 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005695 else {
5696 /* Set up sep and seplen */
5697 if (separator == NULL) {
5698 sep = &blank;
5699 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005701 else {
5702 if (!PyUnicode_Check(separator)) {
5703 PyErr_Format(PyExc_TypeError,
5704 "separator: expected str instance,"
5705 " %.80s found",
5706 Py_TYPE(separator)->tp_name);
5707 goto onError;
5708 }
5709 sep = PyUnicode_AS_UNICODE(separator);
5710 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005711 }
5712 }
5713
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005714 /* There are at least two things to join, or else we have a subclass
5715 * of str in the sequence.
5716 * Do a pre-pass to figure out the total amount of space we'll
5717 * need (sz), and see whether all argument are strings.
5718 */
5719 sz = 0;
5720 for (i = 0; i < seqlen; i++) {
5721 const Py_ssize_t old_sz = sz;
5722 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005723 if (!PyUnicode_Check(item)) {
5724 PyErr_Format(PyExc_TypeError,
5725 "sequence item %zd: expected str instance,"
5726 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005727 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005728 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005729 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005730 sz += PyUnicode_GET_SIZE(item);
5731 if (i != 0)
5732 sz += seplen;
5733 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5734 PyErr_SetString(PyExc_OverflowError,
5735 "join() result is too long for a Python string");
5736 goto onError;
5737 }
5738 }
Tim Petersced69f82003-09-16 20:30:58 +00005739
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005740 res = _PyUnicode_New(sz);
5741 if (res == NULL)
5742 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005743
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005744 /* Catenate everything. */
5745 res_p = PyUnicode_AS_UNICODE(res);
5746 for (i = 0; i < seqlen; ++i) {
5747 Py_ssize_t itemlen;
5748 item = items[i];
5749 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005750 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005751 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005752 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005753 res_p += seplen;
5754 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005755 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5756 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005757 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005758
Tim Peters8ce9f162004-08-27 01:49:32 +00005759 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005760 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return (PyObject *)res;
5762
5763 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005764 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005765 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
5767}
5768
Tim Petersced69f82003-09-16 20:30:58 +00005769static
5770PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005771 Py_ssize_t left,
5772 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 Py_UNICODE fill)
5774{
5775 PyUnicodeObject *u;
5776
5777 if (left < 0)
5778 left = 0;
5779 if (right < 0)
5780 right = 0;
5781
Tim Peters7a29bd52001-09-12 03:03:31 +00005782 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 Py_INCREF(self);
5784 return self;
5785 }
5786
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005787 if (left > PY_SSIZE_T_MAX - self->length ||
5788 right > PY_SSIZE_T_MAX - (left + self->length)) {
5789 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5790 return NULL;
5791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 u = _PyUnicode_New(left + self->length + right);
5793 if (u) {
5794 if (left)
5795 Py_UNICODE_FILL(u->str, fill, left);
5796 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5797 if (right)
5798 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5799 }
5800
5801 return u;
5802}
5803
/* Append the slice data[left:right] to the list under construction.
 *
 * Relies on names supplied by the expanding function: `list` (the target
 * list), `str` (a PyObject * scratch variable) and an `onError` label.
 * Control jumps to onError when the slice object cannot be created or when
 * PyList_Append() reports failure; the temporary reference held in `str` is
 * released on both the success and the failure path of the append.
 *
 * The body is wrapped in do { ... } while (0) so the expansion is a single
 * statement (no dangling else).  Invoke it as a statement, followed by a
 * semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5814
5815static
5816PyObject *split_whitespace(PyUnicodeObject *self,
5817 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005820 register Py_ssize_t i;
5821 register Py_ssize_t j;
5822 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005824 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
5826 for (i = j = 0; i < len; ) {
5827 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005828 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 i++;
5830 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005831 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 i++;
5833 if (j < i) {
5834 if (maxcount-- <= 0)
5835 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005836 SPLIT_APPEND(buf, j, i);
5837 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 i++;
5839 j = i;
5840 }
5841 }
5842 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005843 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 }
5845 return list;
5846
5847 onError:
5848 Py_DECREF(list);
5849 return NULL;
5850}
5851
5852PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005853 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855 register Py_ssize_t i;
5856 register Py_ssize_t j;
5857 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 PyObject *list;
5859 PyObject *str;
5860 Py_UNICODE *data;
5861
5862 string = PyUnicode_FromObject(string);
5863 if (string == NULL)
5864 return NULL;
5865 data = PyUnicode_AS_UNICODE(string);
5866 len = PyUnicode_GET_SIZE(string);
5867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 list = PyList_New(0);
5869 if (!list)
5870 goto onError;
5871
5872 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005873 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
5879 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005880 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 if (i < len) {
5882 if (data[i] == '\r' && i + 1 < len &&
5883 data[i+1] == '\n')
5884 i += 2;
5885 else
5886 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005887 if (keepends)
5888 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 }
Guido van Rossum86662912000-04-11 15:38:46 +00005890 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 j = i;
5892 }
5893 if (j < len) {
5894 SPLIT_APPEND(data, j, len);
5895 }
5896
5897 Py_DECREF(string);
5898 return list;
5899
5900 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005901 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 Py_DECREF(string);
5903 return NULL;
5904}
5905
Tim Petersced69f82003-09-16 20:30:58 +00005906static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907PyObject *split_char(PyUnicodeObject *self,
5908 PyObject *list,
5909 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912 register Py_ssize_t i;
5913 register Py_ssize_t j;
5914 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005916 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
5918 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005919 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 if (maxcount-- <= 0)
5921 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005922 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 i = j = i + 1;
5924 } else
5925 i++;
5926 }
5927 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005928 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
5930 return list;
5931
5932 onError:
5933 Py_DECREF(list);
5934 return NULL;
5935}
5936
Tim Petersced69f82003-09-16 20:30:58 +00005937static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938PyObject *split_substring(PyUnicodeObject *self,
5939 PyObject *list,
5940 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005941 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005943 register Py_ssize_t i;
5944 register Py_ssize_t j;
5945 Py_ssize_t len = self->length;
5946 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 PyObject *str;
5948
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005949 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (Py_UNICODE_MATCH(self, i, substring)) {
5951 if (maxcount-- <= 0)
5952 break;
5953 SPLIT_APPEND(self->str, j, i);
5954 i = j = i + sublen;
5955 } else
5956 i++;
5957 }
5958 if (j <= len) {
5959 SPLIT_APPEND(self->str, j, len);
5960 }
5961 return list;
5962
5963 onError:
5964 Py_DECREF(list);
5965 return NULL;
5966}
5967
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005968static
5969PyObject *rsplit_whitespace(PyUnicodeObject *self,
5970 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005971 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005973 register Py_ssize_t i;
5974 register Py_ssize_t j;
5975 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005977 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005978
5979 for (i = j = len - 1; i >= 0; ) {
5980 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005981 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005982 i--;
5983 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005984 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985 i--;
5986 if (j > i) {
5987 if (maxcount-- <= 0)
5988 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005989 SPLIT_APPEND(buf, i + 1, j + 1);
5990 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991 i--;
5992 j = i;
5993 }
5994 }
5995 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005996 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005997 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 if (PyList_Reverse(list) < 0)
5999 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006000 return list;
6001
6002 onError:
6003 Py_DECREF(list);
6004 return NULL;
6005}
6006
6007static
6008PyObject *rsplit_char(PyUnicodeObject *self,
6009 PyObject *list,
6010 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 register Py_ssize_t i;
6014 register Py_ssize_t j;
6015 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006016 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006017 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006018
6019 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006020 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006021 if (maxcount-- <= 0)
6022 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006023 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006024 j = i = i - 1;
6025 } else
6026 i--;
6027 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006028 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006029 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006030 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006031 if (PyList_Reverse(list) < 0)
6032 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006033 return list;
6034
6035 onError:
6036 Py_DECREF(list);
6037 return NULL;
6038}
6039
6040static
6041PyObject *rsplit_substring(PyUnicodeObject *self,
6042 PyObject *list,
6043 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006045{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006046 register Py_ssize_t i;
6047 register Py_ssize_t j;
6048 Py_ssize_t len = self->length;
6049 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006050 PyObject *str;
6051
6052 for (i = len - sublen, j = len; i >= 0; ) {
6053 if (Py_UNICODE_MATCH(self, i, substring)) {
6054 if (maxcount-- <= 0)
6055 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006056 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006057 j = i;
6058 i -= sublen;
6059 } else
6060 i--;
6061 }
6062 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006063 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006064 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006065 if (PyList_Reverse(list) < 0)
6066 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006067 return list;
6068
6069 onError:
6070 Py_DECREF(list);
6071 return NULL;
6072}
6073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074#undef SPLIT_APPEND
6075
6076static
6077PyObject *split(PyUnicodeObject *self,
6078 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006079 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080{
6081 PyObject *list;
6082
6083 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006084 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
6086 list = PyList_New(0);
6087 if (!list)
6088 return NULL;
6089
6090 if (substring == NULL)
6091 return split_whitespace(self,list,maxcount);
6092
6093 else if (substring->length == 1)
6094 return split_char(self,list,substring->str[0],maxcount);
6095
6096 else if (substring->length == 0) {
6097 Py_DECREF(list);
6098 PyErr_SetString(PyExc_ValueError, "empty separator");
6099 return NULL;
6100 }
6101 else
6102 return split_substring(self,list,substring,maxcount);
6103}
6104
Tim Petersced69f82003-09-16 20:30:58 +00006105static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006106PyObject *rsplit(PyUnicodeObject *self,
6107 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006109{
6110 PyObject *list;
6111
6112 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006113 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006114
6115 list = PyList_New(0);
6116 if (!list)
6117 return NULL;
6118
6119 if (substring == NULL)
6120 return rsplit_whitespace(self,list,maxcount);
6121
6122 else if (substring->length == 1)
6123 return rsplit_char(self,list,substring->str[0],maxcount);
6124
6125 else if (substring->length == 0) {
6126 Py_DECREF(list);
6127 PyErr_SetString(PyExc_ValueError, "empty separator");
6128 return NULL;
6129 }
6130 else
6131 return rsplit_substring(self,list,substring,maxcount);
6132}
6133
6134static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135PyObject *replace(PyUnicodeObject *self,
6136 PyUnicodeObject *str1,
6137 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006138 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
6140 PyUnicodeObject *u;
6141
6142 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006143 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
Thomas Wouters477c8d52006-05-27 19:21:47 +00006145 if (str1->length == str2->length) {
6146 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006147 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006148 if (str1->length == 1) {
6149 /* replace characters */
6150 Py_UNICODE u1, u2;
6151 if (!findchar(self->str, self->length, str1->str[0]))
6152 goto nothing;
6153 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6154 if (!u)
6155 return NULL;
6156 Py_UNICODE_COPY(u->str, self->str, self->length);
6157 u1 = str1->str[0];
6158 u2 = str2->str[0];
6159 for (i = 0; i < u->length; i++)
6160 if (u->str[i] == u1) {
6161 if (--maxcount < 0)
6162 break;
6163 u->str[i] = u2;
6164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006166 i = fastsearch(
6167 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006169 if (i < 0)
6170 goto nothing;
6171 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6172 if (!u)
6173 return NULL;
6174 Py_UNICODE_COPY(u->str, self->str, self->length);
6175 while (i <= self->length - str1->length)
6176 if (Py_UNICODE_MATCH(self, i, str1)) {
6177 if (--maxcount < 0)
6178 break;
6179 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6180 i += str1->length;
6181 } else
6182 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006185
6186 Py_ssize_t n, i, j, e;
6187 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 Py_UNICODE *p;
6189
6190 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006191 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 if (n > maxcount)
6193 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006194 if (n == 0)
6195 goto nothing;
6196 /* new_size = self->length + n * (str2->length - str1->length)); */
6197 delta = (str2->length - str1->length);
6198 if (delta == 0) {
6199 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006201 product = n * (str2->length - str1->length);
6202 if ((product / (str2->length - str1->length)) != n) {
6203 PyErr_SetString(PyExc_OverflowError,
6204 "replace string is too long");
6205 return NULL;
6206 }
6207 new_size = self->length + product;
6208 if (new_size < 0) {
6209 PyErr_SetString(PyExc_OverflowError,
6210 "replace string is too long");
6211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 }
6213 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006214 u = _PyUnicode_New(new_size);
6215 if (!u)
6216 return NULL;
6217 i = 0;
6218 p = u->str;
6219 e = self->length - str1->length;
6220 if (str1->length > 0) {
6221 while (n-- > 0) {
6222 /* look for next match */
6223 j = i;
6224 while (j <= e) {
6225 if (Py_UNICODE_MATCH(self, j, str1))
6226 break;
6227 j++;
6228 }
6229 if (j > i) {
6230 if (j > e)
6231 break;
6232 /* copy unchanged part [i:j] */
6233 Py_UNICODE_COPY(p, self->str+i, j-i);
6234 p += j - i;
6235 }
6236 /* copy substitution string */
6237 if (str2->length > 0) {
6238 Py_UNICODE_COPY(p, str2->str, str2->length);
6239 p += str2->length;
6240 }
6241 i = j + str1->length;
6242 }
6243 if (i < self->length)
6244 /* copy tail [i:] */
6245 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6246 } else {
6247 /* interleave */
6248 while (n > 0) {
6249 Py_UNICODE_COPY(p, str2->str, str2->length);
6250 p += str2->length;
6251 if (--n <= 0)
6252 break;
6253 *p++ = self->str[i++];
6254 }
6255 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006259
6260nothing:
6261 /* nothing to replace; return original string (when possible) */
6262 if (PyUnicode_CheckExact(self)) {
6263 Py_INCREF(self);
6264 return (PyObject *) self;
6265 }
6266 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267}
6268
6269/* --- Unicode Object Methods --------------------------------------------- */
6270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006271PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006272"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273\n\
6274Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006275characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
6277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006278unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return fixup(self, fixtitle);
6281}
6282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006283PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006284"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285\n\
6286Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006287have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
6289static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006290unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 return fixup(self, fixcapitalize);
6293}
6294
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split on whitespace, capitalize every word,
   then join the words again.  Returns NULL with an exception set if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Whitespace split with no limit on the number of splits */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form, releasing the old item */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form the result */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6332
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006333/* Argument converter. Coerces to a single unicode character */
6334
6335static int
6336convert_uc(PyObject *obj, void *addr)
6337{
6338 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6339 PyObject *uniobj;
6340 Py_UNICODE *unistr;
6341
6342 uniobj = PyUnicode_FromObject(obj);
6343 if (uniobj == NULL) {
6344 PyErr_SetString(PyExc_TypeError,
6345 "The fill character cannot be converted to Unicode");
6346 return 0;
6347 }
6348 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6349 PyErr_SetString(PyExc_TypeError,
6350 "The fill character must be exactly one character long");
6351 Py_DECREF(uniobj);
6352 return 0;
6353 }
6354 unistr = PyUnicode_AS_UNICODE(uniobj);
6355 *fillcharloc = unistr[0];
6356 Py_DECREF(uniobj);
6357 return 1;
6358}
6359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006360PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006361"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006363Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006364done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
6366static PyObject *
6367unicode_center(PyUnicodeObject *self, PyObject *args)
6368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006369 Py_ssize_t marg, left;
6370 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006371 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Thomas Woutersde017742006-02-16 19:34:37 +00006373 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 return NULL;
6375
Tim Peters7a29bd52001-09-12 03:03:31 +00006376 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 Py_INCREF(self);
6378 return (PyObject*) self;
6379 }
6380
6381 marg = width - self->length;
6382 left = marg / 2 + (marg & width & 1);
6383
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006384 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385}
6386
Marc-André Lemburge5034372000-08-08 08:04:29 +00006387#if 0
6388
6389/* This code should go into some future Unicode collation support
6390 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006391 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006392
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006393/* speedy UTF-16 code point order comparison */
6394/* gleaned from: */
6395/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6396
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006397static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006398{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006399 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006400 0, 0, 0, 0, 0, 0, 0, 0,
6401 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006402 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006403};
6404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405static int
6406unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6407{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 Py_UNICODE *s1 = str1->str;
6411 Py_UNICODE *s2 = str2->str;
6412
6413 len1 = str1->length;
6414 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006417 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006418
6419 c1 = *s1++;
6420 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006421
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006422 if (c1 > (1<<11) * 26)
6423 c1 += utf16Fixup[c1>>11];
6424 if (c2 > (1<<11) * 26)
6425 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006426 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006427
6428 if (c1 != c2)
6429 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006430
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006431 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 }
6433
6434 return (len1 < len2) ? -1 : (len1 != len2);
6435}
6436
Marc-André Lemburge5034372000-08-08 08:04:29 +00006437#else
6438
6439static int
6440unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006442 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006443
6444 Py_UNICODE *s1 = str1->str;
6445 Py_UNICODE *s2 = str2->str;
6446
6447 len1 = str1->length;
6448 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Marc-André Lemburge5034372000-08-08 08:04:29 +00006450 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006451 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006452
Fredrik Lundh45714e92001-06-26 16:39:36 +00006453 c1 = *s1++;
6454 c2 = *s2++;
6455
6456 if (c1 != c2)
6457 return (c1 < c2) ? -1 : 1;
6458
Marc-André Lemburge5034372000-08-08 08:04:29 +00006459 len1--; len2--;
6460 }
6461
6462 return (len1 < len2) ? -1 : (len1 != len2);
6463}
6464
6465#endif
6466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467int PyUnicode_Compare(PyObject *left,
6468 PyObject *right)
6469{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006470 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6471 return unicode_compare((PyUnicodeObject *)left,
6472 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006473 PyErr_Format(PyExc_TypeError,
6474 "Can't compare %.100s and %.100s",
6475 left->ob_type->tp_name,
6476 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 return -1;
6478}
6479
Martin v. Löwis5b222132007-06-10 09:51:05 +00006480int
6481PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6482{
6483 int i;
6484 Py_UNICODE *id;
6485 assert(PyUnicode_Check(uni));
6486 id = PyUnicode_AS_UNICODE(uni);
6487 /* Compare Unicode string and source character set string */
6488 for (i = 0; id[i] && str[i]; i++)
6489 if (id[i] != str[i])
6490 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6491 if (id[i])
6492 return 1; /* uni is longer */
6493 if (str[i])
6494 return -1; /* str is longer */
6495 return 0;
6496}
6497
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006498PyObject *PyUnicode_RichCompare(PyObject *left,
6499 PyObject *right,
6500 int op)
6501{
6502 int result;
6503
6504 result = PyUnicode_Compare(left, right);
6505 if (result == -1 && PyErr_Occurred())
6506 goto onError;
6507
6508 /* Convert the return value to a Boolean */
6509 switch (op) {
6510 case Py_EQ:
6511 result = (result == 0);
6512 break;
6513 case Py_NE:
6514 result = (result != 0);
6515 break;
6516 case Py_LE:
6517 result = (result <= 0);
6518 break;
6519 case Py_GE:
6520 result = (result >= 0);
6521 break;
6522 case Py_LT:
6523 result = (result == -1);
6524 break;
6525 case Py_GT:
6526 result = (result == 1);
6527 break;
6528 }
6529 return PyBool_FromLong(result);
6530
6531 onError:
6532
6533 /* Standard case
6534
6535 Type errors mean that PyUnicode_FromObject() could not convert
6536 one of the arguments (usually the right hand side) to Unicode,
6537 ie. we can't handle the comparison request. However, it is
6538 possible that the other object knows a comparison method, which
6539 is why we return Py_NotImplemented to give the other object a
6540 chance.
6541
6542 */
6543 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6544 PyErr_Clear();
6545 Py_INCREF(Py_NotImplemented);
6546 return Py_NotImplemented;
6547 }
6548 if (op != Py_EQ && op != Py_NE)
6549 return NULL;
6550
6551 /* Equality comparison.
6552
6553 This is a special case: we silence any PyExc_UnicodeDecodeError
6554 and instead turn it into a PyErr_UnicodeWarning.
6555
6556 */
6557 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6558 return NULL;
6559 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006560 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6561 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006562 "equal comparison "
6563 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006564 "interpreting them as being unequal"
6565 :
6566 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006567 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006568 "interpreting them as being unequal",
6569 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006570 return NULL;
6571 result = (op == Py_NE);
6572 return PyBool_FromLong(result);
6573}
6574
Guido van Rossum403d68b2000-03-13 15:55:09 +00006575int PyUnicode_Contains(PyObject *container,
6576 PyObject *element)
6577{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006580
6581 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006582 sub = PyUnicode_FromObject(element);
6583 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006584 PyErr_Format(PyExc_TypeError,
6585 "'in <string>' requires string as left operand, not %s",
6586 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006588 }
6589
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590 str = PyUnicode_FromObject(container);
6591 if (!str) {
6592 Py_DECREF(sub);
6593 return -1;
6594 }
6595
6596 result = stringlib_contains_obj(str, sub);
6597
6598 Py_DECREF(str);
6599 Py_DECREF(sub);
6600
Guido van Rossum403d68b2000-03-13 15:55:09 +00006601 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006602}
6603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604/* Concat to string or Unicode object giving a new Unicode object. */
6605
6606PyObject *PyUnicode_Concat(PyObject *left,
6607 PyObject *right)
6608{
6609 PyUnicodeObject *u = NULL, *v = NULL, *w;
6610
6611 /* Coerce the two arguments */
6612 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6613 if (u == NULL)
6614 goto onError;
6615 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6616 if (v == NULL)
6617 goto onError;
6618
6619 /* Shortcuts */
6620 if (v == unicode_empty) {
6621 Py_DECREF(v);
6622 return (PyObject *)u;
6623 }
6624 if (u == unicode_empty) {
6625 Py_DECREF(u);
6626 return (PyObject *)v;
6627 }
6628
6629 /* Concat the two Unicode strings */
6630 w = _PyUnicode_New(u->length + v->length);
6631 if (w == NULL)
6632 goto onError;
6633 Py_UNICODE_COPY(w->str, u->str, u->length);
6634 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6635
6636 Py_DECREF(u);
6637 Py_DECREF(v);
6638 return (PyObject *)w;
6639
6640onError:
6641 Py_XDECREF(u);
6642 Py_XDECREF(v);
6643 return NULL;
6644}
6645
Walter Dörwald1ab83302007-05-18 17:15:44 +00006646void
6647PyUnicode_Append(PyObject **pleft, PyObject *right)
6648{
6649 PyObject *new;
6650 if (*pleft == NULL)
6651 return;
6652 if (right == NULL || !PyUnicode_Check(*pleft)) {
6653 Py_DECREF(*pleft);
6654 *pleft = NULL;
6655 return;
6656 }
6657 new = PyUnicode_Concat(*pleft, right);
6658 Py_DECREF(*pleft);
6659 *pleft = new;
6660}
6661
6662void
6663PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6664{
6665 PyUnicode_Append(pleft, right);
6666 Py_XDECREF(right);
6667}
6668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006669PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670"S.count(sub[, start[, end]]) -> int\n\
6671\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006672Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006673string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
6676static PyObject *
6677unicode_count(PyUnicodeObject *self, PyObject *args)
6678{
6679 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006680 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006681 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 PyObject *result;
6683
Guido van Rossumb8872e62000-05-09 14:14:27 +00006684 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6685 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 return NULL;
6687
6688 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 if (substring == NULL)
6691 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006692
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
Christian Heimes217cfd12007-12-02 14:31:20 +00006695 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 stringlib_count(self->str + start, end - start,
6697 substring->str, substring->length)
6698 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006701
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 return result;
6703}
6704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006705PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006706"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006708Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006709to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006710handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6712'xmlcharrefreplace' as well as any other name registered with\n\
6713codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject *
6716unicode_encode(PyUnicodeObject *self, PyObject *args)
6717{
6718 char *encoding = NULL;
6719 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006720 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6723 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006724 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006725 if (v == NULL)
6726 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006727 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006728 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006729 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006730 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006731 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006732 Py_DECREF(v);
6733 return NULL;
6734 }
6735 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006736
6737 onError:
6738 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006739}
6740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006742"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743\n\
6744Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747static PyObject*
6748unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6749{
6750 Py_UNICODE *e;
6751 Py_UNICODE *p;
6752 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006753 Py_UNICODE *qe;
6754 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 PyUnicodeObject *u;
6756 int tabsize = 8;
6757
6758 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6759 return NULL;
6760
Thomas Wouters7e474022000-07-16 12:04:32 +00006761 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006762 i = 0; /* chars up to and including most recent \n or \r */
6763 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6764 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 for (p = self->str; p < e; p++)
6766 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006767 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006768 incr = tabsize - (j % tabsize); /* cannot overflow */
6769 if (j > PY_SSIZE_T_MAX - incr)
6770 goto overflow1;
6771 j += incr;
6772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
6774 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006775 if (j > PY_SSIZE_T_MAX - 1)
6776 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 j++;
6778 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006779 if (i > PY_SSIZE_T_MAX - j)
6780 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006782 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
6784 }
6785
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006786 if (i > PY_SSIZE_T_MAX - j)
6787 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006788
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 /* Second pass: create output string and fill it */
6790 u = _PyUnicode_New(i + j);
6791 if (!u)
6792 return NULL;
6793
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006794 j = 0; /* same as in first pass */
6795 q = u->str; /* next output char */
6796 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
6798 for (p = self->str; p < e; p++)
6799 if (*p == '\t') {
6800 if (tabsize > 0) {
6801 i = tabsize - (j % tabsize);
6802 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006803 while (i--) {
6804 if (q >= qe)
6805 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 }
6809 }
6810 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006811 if (q >= qe)
6812 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006814 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 if (*p == '\n' || *p == '\r')
6816 j = 0;
6817 }
6818
6819 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006820
6821 overflow2:
6822 Py_DECREF(u);
6823 overflow1:
6824 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006829"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
6831Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006832such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833arguments start and end are interpreted as in slice notation.\n\
6834\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006835Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837static PyObject *
6838unicode_find(PyUnicodeObject *self, PyObject *args)
6839{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006840 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006841 Py_ssize_t start;
6842 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006843 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Christian Heimes9cd17752007-11-18 19:35:23 +00006845 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
Thomas Wouters477c8d52006-05-27 19:21:47 +00006848 result = stringlib_find_slice(
6849 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6850 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6851 start, end
6852 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853
6854 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006855
Christian Heimes217cfd12007-12-02 14:31:20 +00006856 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857}
6858
6859static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006860unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
6862 if (index < 0 || index >= self->length) {
6863 PyErr_SetString(PyExc_IndexError, "string index out of range");
6864 return NULL;
6865 }
6866
6867 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6868}
6869
Guido van Rossumc2504932007-09-18 19:42:40 +00006870/* Believe it or not, this produces the same value for ASCII strings
6871 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006873unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Guido van Rossumc2504932007-09-18 19:42:40 +00006875 Py_ssize_t len;
6876 Py_UNICODE *p;
6877 long x;
6878
6879 if (self->hash != -1)
6880 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006881 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006882 p = self->str;
6883 x = *p << 7;
6884 while (--len >= 0)
6885 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006886 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006887 if (x == -1)
6888 x = -2;
6889 self->hash = x;
6890 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891}
6892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006894"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898static PyObject *
6899unicode_index(PyUnicodeObject *self, PyObject *args)
6900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006901 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006902 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006903 Py_ssize_t start;
6904 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
Christian Heimes9cd17752007-11-18 19:35:23 +00006906 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
Thomas Wouters477c8d52006-05-27 19:21:47 +00006909 result = stringlib_find_slice(
6910 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6911 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6912 start, end
6913 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006916
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 if (result < 0) {
6918 PyErr_SetString(PyExc_ValueError, "substring not found");
6919 return NULL;
6920 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006921
Christian Heimes217cfd12007-12-02 14:31:20 +00006922 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923}
6924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
6931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006932unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
6934 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6935 register const Py_UNICODE *e;
6936 int cased;
6937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 /* Shortcut for single character strings */
6939 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006942 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006943 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 e = p + PyUnicode_GET_SIZE(self);
6947 cased = 0;
6948 for (; p < e; p++) {
6949 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 else if (!cased && Py_UNICODE_ISLOWER(ch))
6954 cased = 1;
6955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006962Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006963at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
6965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006966unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967{
6968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6969 register const Py_UNICODE *e;
6970 int cased;
6971
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 /* Shortcut for single character strings */
6973 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006976 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006977 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 e = p + PyUnicode_GET_SIZE(self);
6981 cased = 0;
6982 for (; p < e; p++) {
6983 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 else if (!cased && Py_UNICODE_ISUPPER(ch))
6988 cased = 1;
6989 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991}
6992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006994"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006996Return True if S is a titlecased string and there is at least one\n\
6997character in S, i.e. upper- and titlecase characters may only\n\
6998follow uncased characters and lowercase characters only cased ones.\n\
6999Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000
7001static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007002unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003{
7004 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7005 register const Py_UNICODE *e;
7006 int cased, previous_is_cased;
7007
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007010 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7011 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007014 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 e = p + PyUnicode_GET_SIZE(self);
7018 cased = 0;
7019 previous_is_cased = 0;
7020 for (; p < e; p++) {
7021 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007022
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7024 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 previous_is_cased = 1;
7027 cased = 1;
7028 }
7029 else if (Py_UNICODE_ISLOWER(ch)) {
7030 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007031 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 previous_is_cased = 1;
7033 cased = 1;
7034 }
7035 else
7036 previous_is_cased = 0;
7037 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007041PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007042"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007044Return True if all characters in S are whitespace\n\
7045and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007048unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
7050 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7051 register const Py_UNICODE *e;
7052
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 /* Shortcut for single character strings */
7054 if (PyUnicode_GET_SIZE(self) == 1 &&
7055 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007056 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007058 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007059 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007060 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007061
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 e = p + PyUnicode_GET_SIZE(self);
7063 for (; p < e; p++) {
7064 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007065 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007067 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068}
7069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007070PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007071"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007072\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007073Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007074and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007075
7076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007077unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007078{
7079 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7080 register const Py_UNICODE *e;
7081
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007082 /* Shortcut for single character strings */
7083 if (PyUnicode_GET_SIZE(self) == 1 &&
7084 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007085 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007086
7087 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007088 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007089 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007090
7091 e = p + PyUnicode_GET_SIZE(self);
7092 for (; p < e; p++) {
7093 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007094 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007095 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007096 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007097}
7098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007099PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007100"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007101\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007102Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007103and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007104
7105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007106unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007107{
7108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7109 register const Py_UNICODE *e;
7110
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007111 /* Shortcut for single character strings */
7112 if (PyUnicode_GET_SIZE(self) == 1 &&
7113 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007114 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007115
7116 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007117 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007118 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007119
7120 e = p + PyUnicode_GET_SIZE(self);
7121 for (; p < e; p++) {
7122 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007123 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007124 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007126}
7127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007129"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007131Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007135unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
7137 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7138 register const Py_UNICODE *e;
7139
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 /* Shortcut for single character strings */
7141 if (PyUnicode_GET_SIZE(self) == 1 &&
7142 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007143 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007145 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007146 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007147 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 e = p + PyUnicode_GET_SIZE(self);
7150 for (; p < e; p++) {
7151 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007152 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007154 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155}
7156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007157PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007158"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007160Return True if all characters in S are digits\n\
7161and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
7163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007164unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165{
7166 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7167 register const Py_UNICODE *e;
7168
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 /* Shortcut for single character strings */
7170 if (PyUnicode_GET_SIZE(self) == 1 &&
7171 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007172 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007174 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007175 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007176 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 e = p + PyUnicode_GET_SIZE(self);
7179 for (; p < e; p++) {
7180 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007181 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007183 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184}
7185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007186PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007187"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007189Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007193unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194{
7195 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7196 register const Py_UNICODE *e;
7197
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 /* Shortcut for single character strings */
7199 if (PyUnicode_GET_SIZE(self) == 1 &&
7200 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007201 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007203 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007204 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007205 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007206
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 e = p + PyUnicode_GET_SIZE(self);
7208 for (; p < e; p++) {
7209 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007212 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213}
7214
Martin v. Löwis47383402007-08-15 07:32:56 +00007215int
7216PyUnicode_IsIdentifier(PyObject *self)
7217{
7218 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7219 register const Py_UNICODE *e;
7220
7221 /* Special case for empty strings */
7222 if (PyUnicode_GET_SIZE(self) == 0)
7223 return 0;
7224
7225 /* PEP 3131 says that the first character must be in
7226 XID_Start and subsequent characters in XID_Continue,
7227 and for the ASCII range, the 2.x rules apply (i.e
7228 start with letters and underscore, continue with
7229 letters, digits, underscore). However, given the current
7230 definition of XID_Start and XID_Continue, it is sufficient
7231 to check just for these, except that _ must be allowed
7232 as starting an identifier. */
7233 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7234 return 0;
7235
7236 e = p + PyUnicode_GET_SIZE(self);
7237 for (p++; p < e; p++) {
7238 if (!_PyUnicode_IsXidContinue(*p))
7239 return 0;
7240 }
7241 return 1;
7242}
7243
7244PyDoc_STRVAR(isidentifier__doc__,
7245"S.isidentifier() -> bool\n\
7246\n\
7247Return True if S is a valid identifier according\n\
7248to the language definition.");
7249
7250static PyObject*
7251unicode_isidentifier(PyObject *self)
7252{
7253 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7254}
7255
Georg Brandl559e5d72008-06-11 18:37:52 +00007256PyDoc_STRVAR(isprintable__doc__,
7257"S.isprintable() -> bool\n\
7258\n\
7259Return True if all characters in S are considered\n\
7260printable in repr() or S is empty, False otherwise.");
7261
7262static PyObject*
7263unicode_isprintable(PyObject *self)
7264{
7265 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7266 register const Py_UNICODE *e;
7267
7268 /* Shortcut for single character strings */
7269 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7270 Py_RETURN_TRUE;
7271 }
7272
7273 e = p + PyUnicode_GET_SIZE(self);
7274 for (; p < e; p++) {
7275 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7276 Py_RETURN_FALSE;
7277 }
7278 }
7279 Py_RETURN_TRUE;
7280}
7281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007282PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007283"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284\n\
7285Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007286sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007289unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007291 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292}
7293
Martin v. Löwis18e16552006-02-15 17:27:45 +00007294static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295unicode_length(PyUnicodeObject *self)
7296{
7297 return self->length;
7298}
7299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007301"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302\n\
7303Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007304done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
7306static PyObject *
7307unicode_ljust(PyUnicodeObject *self, PyObject *args)
7308{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007309 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007310 Py_UNICODE fillchar = ' ';
7311
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007312 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 return NULL;
7314
Tim Peters7a29bd52001-09-12 03:03:31 +00007315 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 Py_INCREF(self);
7317 return (PyObject*) self;
7318 }
7319
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007320 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321}
7322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007324"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
7328static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 return fixup(self, fixlower);
7332}
7333
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7342
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343/* externally visible for str.strip(unicode) */
7344PyObject *
7345_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7346{
7347 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007348 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007349 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007350 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7351 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007352
Thomas Wouters477c8d52006-05-27 19:21:47 +00007353 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7354
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007355 i = 0;
7356 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007357 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7358 i++;
7359 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007360 }
7361
7362 j = len;
7363 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007364 do {
7365 j--;
7366 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7367 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007368 }
7369
7370 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007371 Py_INCREF(self);
7372 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007373 }
7374 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007375 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007376}
7377
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
7379static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007380do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007382 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007383 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007384
7385 i = 0;
7386 if (striptype != RIGHTSTRIP) {
7387 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7388 i++;
7389 }
7390 }
7391
7392 j = len;
7393 if (striptype != LEFTSTRIP) {
7394 do {
7395 j--;
7396 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7397 j++;
7398 }
7399
7400 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7401 Py_INCREF(self);
7402 return (PyObject*)self;
7403 }
7404 else
7405 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406}
7407
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007408
7409static PyObject *
7410do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7411{
7412 PyObject *sep = NULL;
7413
7414 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7415 return NULL;
7416
7417 if (sep != NULL && sep != Py_None) {
7418 if (PyUnicode_Check(sep))
7419 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007420 else {
7421 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007422 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007423 STRIPNAME(striptype));
7424 return NULL;
7425 }
7426 }
7427
7428 return do_strip(self, striptype);
7429}
7430
7431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007432PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007433"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007434\n\
7435Return a copy of the string S with leading and trailing\n\
7436whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007437If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007438
7439static PyObject *
7440unicode_strip(PyUnicodeObject *self, PyObject *args)
7441{
7442 if (PyTuple_GET_SIZE(args) == 0)
7443 return do_strip(self, BOTHSTRIP); /* Common case */
7444 else
7445 return do_argstrip(self, BOTHSTRIP, args);
7446}
7447
7448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007449PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007450"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007451\n\
7452Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007453If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007454
7455static PyObject *
7456unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7457{
7458 if (PyTuple_GET_SIZE(args) == 0)
7459 return do_strip(self, LEFTSTRIP); /* Common case */
7460 else
7461 return do_argstrip(self, LEFTSTRIP, args);
7462}
7463
7464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007465PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007466"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007467\n\
7468Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007469If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007470
7471static PyObject *
7472unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7473{
7474 if (PyTuple_GET_SIZE(args) == 0)
7475 return do_strip(self, RIGHTSTRIP); /* Common case */
7476 else
7477 return do_argstrip(self, RIGHTSTRIP, args);
7478}
7479
7480
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007482unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
7484 PyUnicodeObject *u;
7485 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007486 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007487 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
7489 if (len < 0)
7490 len = 0;
7491
Tim Peters7a29bd52001-09-12 03:03:31 +00007492 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 /* no repeat, return original string */
7494 Py_INCREF(str);
7495 return (PyObject*) str;
7496 }
Tim Peters8f422462000-09-09 06:13:41 +00007497
7498 /* ensure # of chars needed doesn't overflow int and # of bytes
7499 * needed doesn't overflow size_t
7500 */
7501 nchars = len * str->length;
7502 if (len && nchars / len != str->length) {
7503 PyErr_SetString(PyExc_OverflowError,
7504 "repeated string is too long");
7505 return NULL;
7506 }
7507 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7508 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7509 PyErr_SetString(PyExc_OverflowError,
7510 "repeated string is too long");
7511 return NULL;
7512 }
7513 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 if (!u)
7515 return NULL;
7516
7517 p = u->str;
7518
Thomas Wouters477c8d52006-05-27 19:21:47 +00007519 if (str->length == 1 && len > 0) {
7520 Py_UNICODE_FILL(p, str->str[0], len);
7521 } else {
7522 Py_ssize_t done = 0; /* number of characters copied this far */
7523 if (done < nchars) {
7524 Py_UNICODE_COPY(p, str->str, str->length);
7525 done = str->length;
7526 }
7527 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007528 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007529 Py_UNICODE_COPY(p+done, p, n);
7530 done += n;
7531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 }
7533
7534 return (PyObject*) u;
7535}
7536
7537PyObject *PyUnicode_Replace(PyObject *obj,
7538 PyObject *subobj,
7539 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
7542 PyObject *self;
7543 PyObject *str1;
7544 PyObject *str2;
7545 PyObject *result;
7546
7547 self = PyUnicode_FromObject(obj);
7548 if (self == NULL)
7549 return NULL;
7550 str1 = PyUnicode_FromObject(subobj);
7551 if (str1 == NULL) {
7552 Py_DECREF(self);
7553 return NULL;
7554 }
7555 str2 = PyUnicode_FromObject(replobj);
7556 if (str2 == NULL) {
7557 Py_DECREF(self);
7558 Py_DECREF(str1);
7559 return NULL;
7560 }
Tim Petersced69f82003-09-16 20:30:58 +00007561 result = replace((PyUnicodeObject *)self,
7562 (PyUnicodeObject *)str1,
7563 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 maxcount);
7565 Py_DECREF(self);
7566 Py_DECREF(str1);
7567 Py_DECREF(str2);
7568 return result;
7569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007572"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
7574Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007575old replaced by new. If the optional argument count is\n\
7576given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
7579unicode_replace(PyUnicodeObject *self, PyObject *args)
7580{
7581 PyUnicodeObject *str1;
7582 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 PyObject *result;
7585
Martin v. Löwis18e16552006-02-15 17:27:45 +00007586 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 return NULL;
7588 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7589 if (str1 == NULL)
7590 return NULL;
7591 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007592 if (str2 == NULL) {
7593 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
7597 result = replace(self, str1, str2, maxcount);
7598
7599 Py_DECREF(str1);
7600 Py_DECREF(str2);
7601 return result;
7602}
7603
7604static
7605PyObject *unicode_repr(PyObject *unicode)
7606{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007607 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007608 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007609 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7610 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7611
7612 /* XXX(nnorwitz): rather than over-allocating, it would be
7613 better to choose a different scheme. Perhaps scan the
7614 first N-chars of the string and allocate based on that size.
7615 */
7616 /* Initial allocation is based on the longest-possible unichr
7617 escape.
7618
7619 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7620 unichr, so in this case it's the longest unichr escape. In
7621 narrow (UTF-16) builds this is five chars per source unichr
7622 since there are two unichrs in the surrogate pair, so in narrow
7623 (UTF-16) builds it's not the longest unichr escape.
7624
7625 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7626 so in the narrow (UTF-16) build case it's the longest unichr
7627 escape.
7628 */
7629
Walter Dörwald1ab83302007-05-18 17:15:44 +00007630 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007631 2 /* quotes */
7632#ifdef Py_UNICODE_WIDE
7633 + 10*size
7634#else
7635 + 6*size
7636#endif
7637 + 1);
7638 if (repr == NULL)
7639 return NULL;
7640
Walter Dörwald1ab83302007-05-18 17:15:44 +00007641 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007642
7643 /* Add quote */
7644 *p++ = (findchar(s, size, '\'') &&
7645 !findchar(s, size, '"')) ? '"' : '\'';
7646 while (size-- > 0) {
7647 Py_UNICODE ch = *s++;
7648
7649 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007650 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007651 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007652 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007653 continue;
7654 }
7655
Georg Brandl559e5d72008-06-11 18:37:52 +00007656 /* Map special whitespace to '\t', \n', '\r' */
7657 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007658 *p++ = '\\';
7659 *p++ = 't';
7660 }
7661 else if (ch == '\n') {
7662 *p++ = '\\';
7663 *p++ = 'n';
7664 }
7665 else if (ch == '\r') {
7666 *p++ = '\\';
7667 *p++ = 'r';
7668 }
7669
7670 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007671 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007672 *p++ = '\\';
7673 *p++ = 'x';
7674 *p++ = hexdigits[(ch >> 4) & 0x000F];
7675 *p++ = hexdigits[ch & 0x000F];
7676 }
7677
Georg Brandl559e5d72008-06-11 18:37:52 +00007678 /* Copy ASCII characters as-is */
7679 else if (ch < 0x7F) {
7680 *p++ = ch;
7681 }
7682
7683 /* Non-ASCII characters */
7684 else {
7685 Py_UCS4 ucs = ch;
7686
7687#ifndef Py_UNICODE_WIDE
7688 Py_UNICODE ch2 = 0;
7689 /* Get code point from surrogate pair */
7690 if (size > 0) {
7691 ch2 = *s;
7692 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7693 && ch2 <= 0xDFFF) {
7694 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7695 + 0x00010000;
7696 s++;
7697 size--;
7698 }
7699 }
7700#endif
7701 /* Map Unicode whitespace and control characters
7702 (categories Z* and C* except ASCII space)
7703 */
7704 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7705 /* Map 8-bit characters to '\xhh' */
7706 if (ucs <= 0xff) {
7707 *p++ = '\\';
7708 *p++ = 'x';
7709 *p++ = hexdigits[(ch >> 4) & 0x000F];
7710 *p++ = hexdigits[ch & 0x000F];
7711 }
7712 /* Map 21-bit characters to '\U00xxxxxx' */
7713 else if (ucs >= 0x10000) {
7714 *p++ = '\\';
7715 *p++ = 'U';
7716 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7717 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7718 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7719 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7720 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7721 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7722 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7723 *p++ = hexdigits[ucs & 0x0000000F];
7724 }
7725 /* Map 16-bit characters to '\uxxxx' */
7726 else {
7727 *p++ = '\\';
7728 *p++ = 'u';
7729 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7730 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7731 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7732 *p++ = hexdigits[ucs & 0x000F];
7733 }
7734 }
7735 /* Copy characters as-is */
7736 else {
7737 *p++ = ch;
7738#ifndef Py_UNICODE_WIDE
7739 if (ucs >= 0x10000)
7740 *p++ = ch2;
7741#endif
7742 }
7743 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007744 }
7745 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007746 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007747
7748 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007749 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007750 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751}
7752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007753PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007754"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755\n\
7756Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007757such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758arguments start and end are interpreted as in slice notation.\n\
7759\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761
7762static PyObject *
7763unicode_rfind(PyUnicodeObject *self, PyObject *args)
7764{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007765 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007766 Py_ssize_t start;
7767 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007768 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
Christian Heimes9cd17752007-11-18 19:35:23 +00007770 if (!_ParseTupleFinds(args, &substring, &start, &end))
7771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
Thomas Wouters477c8d52006-05-27 19:21:47 +00007773 result = stringlib_rfind_slice(
7774 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7775 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7776 start, end
7777 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
7779 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007780
Christian Heimes217cfd12007-12-02 14:31:20 +00007781 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782}
7783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007785"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
7789static PyObject *
7790unicode_rindex(PyUnicodeObject *self, PyObject *args)
7791{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007792 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007793 Py_ssize_t start;
7794 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007795 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
Christian Heimes9cd17752007-11-18 19:35:23 +00007797 if (!_ParseTupleFinds(args, &substring, &start, &end))
7798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
Thomas Wouters477c8d52006-05-27 19:21:47 +00007800 result = stringlib_rfind_slice(
7801 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7802 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7803 start, end
7804 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
7806 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007807
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 if (result < 0) {
7809 PyErr_SetString(PyExc_ValueError, "substring not found");
7810 return NULL;
7811 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007812 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813}
7814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007815PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007816"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007818Return S right justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007819done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820
7821static PyObject *
7822unicode_rjust(PyUnicodeObject *self, PyObject *args)
7823{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007824 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007825 Py_UNICODE fillchar = ' ';
7826
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007827 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 return NULL;
7829
Tim Peters7a29bd52001-09-12 03:03:31 +00007830 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 Py_INCREF(self);
7832 return (PyObject*) self;
7833 }
7834
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007835 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836}
7837
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838PyObject *PyUnicode_Split(PyObject *s,
7839 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841{
7842 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007843
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 s = PyUnicode_FromObject(s);
7845 if (s == NULL)
7846 return NULL;
7847 if (sep != NULL) {
7848 sep = PyUnicode_FromObject(sep);
7849 if (sep == NULL) {
7850 Py_DECREF(s);
7851 return NULL;
7852 }
7853 }
7854
7855 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7856
7857 Py_DECREF(s);
7858 Py_XDECREF(sep);
7859 return result;
7860}
7861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007862PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007863"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864\n\
7865Return a list of the words in S, using sep as the\n\
7866delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007867splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007868whitespace string is a separator and empty strings are\n\
7869removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
7871static PyObject*
7872unicode_split(PyUnicodeObject *self, PyObject *args)
7873{
7874 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007875 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876
Martin v. Löwis18e16552006-02-15 17:27:45 +00007877 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 return NULL;
7879
7880 if (substring == Py_None)
7881 return split(self, NULL, maxcount);
7882 else if (PyUnicode_Check(substring))
7883 return split(self, (PyUnicodeObject *)substring, maxcount);
7884 else
7885 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7886}
7887
Thomas Wouters477c8d52006-05-27 19:21:47 +00007888PyObject *
7889PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7890{
7891 PyObject* str_obj;
7892 PyObject* sep_obj;
7893 PyObject* out;
7894
7895 str_obj = PyUnicode_FromObject(str_in);
7896 if (!str_obj)
7897 return NULL;
7898 sep_obj = PyUnicode_FromObject(sep_in);
7899 if (!sep_obj) {
7900 Py_DECREF(str_obj);
7901 return NULL;
7902 }
7903
7904 out = stringlib_partition(
7905 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7906 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7907 );
7908
7909 Py_DECREF(sep_obj);
7910 Py_DECREF(str_obj);
7911
7912 return out;
7913}
7914
7915
7916PyObject *
7917PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7918{
7919 PyObject* str_obj;
7920 PyObject* sep_obj;
7921 PyObject* out;
7922
7923 str_obj = PyUnicode_FromObject(str_in);
7924 if (!str_obj)
7925 return NULL;
7926 sep_obj = PyUnicode_FromObject(sep_in);
7927 if (!sep_obj) {
7928 Py_DECREF(str_obj);
7929 return NULL;
7930 }
7931
7932 out = stringlib_rpartition(
7933 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7934 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7935 );
7936
7937 Py_DECREF(sep_obj);
7938 Py_DECREF(str_obj);
7939
7940 return out;
7941}
7942
7943PyDoc_STRVAR(partition__doc__,
7944"S.partition(sep) -> (head, sep, tail)\n\
7945\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007946Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947the separator itself, and the part after it. If the separator is not\n\
7948found, returns S and two empty strings.");
7949
7950static PyObject*
7951unicode_partition(PyUnicodeObject *self, PyObject *separator)
7952{
7953 return PyUnicode_Partition((PyObject *)self, separator);
7954}
7955
7956PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007957"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007958\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007959Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007961separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962
7963static PyObject*
7964unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7965{
7966 return PyUnicode_RPartition((PyObject *)self, separator);
7967}
7968
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007969PyObject *PyUnicode_RSplit(PyObject *s,
7970 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007971 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007972{
7973 PyObject *result;
7974
7975 s = PyUnicode_FromObject(s);
7976 if (s == NULL)
7977 return NULL;
7978 if (sep != NULL) {
7979 sep = PyUnicode_FromObject(sep);
7980 if (sep == NULL) {
7981 Py_DECREF(s);
7982 return NULL;
7983 }
7984 }
7985
7986 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7987
7988 Py_DECREF(s);
7989 Py_XDECREF(sep);
7990 return result;
7991}
7992
7993PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007994"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007995\n\
7996Return a list of the words in S, using sep as the\n\
7997delimiter string, starting at the end of the string and\n\
7998working to the front. If maxsplit is given, at most maxsplit\n\
7999splits are done. If sep is not specified, any whitespace string\n\
8000is a separator.");
8001
8002static PyObject*
8003unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8004{
8005 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008006 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008007
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008009 return NULL;
8010
8011 if (substring == Py_None)
8012 return rsplit(self, NULL, maxcount);
8013 else if (PyUnicode_Check(substring))
8014 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8015 else
8016 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8017}
8018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008019PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00008020"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021\n\
8022Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008023Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008024is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
8026static PyObject*
8027unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8028{
Guido van Rossum86662912000-04-11 15:38:46 +00008029 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Guido van Rossum86662912000-04-11 15:38:46 +00008031 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 return NULL;
8033
Guido van Rossum86662912000-04-11 15:38:46 +00008034 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035}
8036
8037static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008038PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
Walter Dörwald346737f2007-05-31 10:44:43 +00008040 if (PyUnicode_CheckExact(self)) {
8041 Py_INCREF(self);
8042 return self;
8043 } else
8044 /* Subtype -- return genuine unicode string with the same value. */
8045 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8046 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047}
8048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008049PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008050"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051\n\
8052Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008053and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
8055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008056unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 return fixup(self, fixswapcase);
8059}
8060
Georg Brandlceee0772007-11-27 23:48:05 +00008061PyDoc_STRVAR(maketrans__doc__,
8062"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8063\n\
8064Return a translation table usable for str.translate().\n\
8065If there is only one argument, it must be a dictionary mapping Unicode\n\
8066ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008067Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008068If there are two arguments, they must be strings of equal length, and\n\
8069in the resulting dictionary, each character in x will be mapped to the\n\
8070character at the same position in y. If there is a third argument, it\n\
8071must be a string, whose characters will be mapped to None in the result.");
8072
8073static PyObject*
8074unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8075{
8076 PyObject *x, *y = NULL, *z = NULL;
8077 PyObject *new = NULL, *key, *value;
8078 Py_ssize_t i = 0;
8079 int res;
8080
8081 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8082 return NULL;
8083 new = PyDict_New();
8084 if (!new)
8085 return NULL;
8086 if (y != NULL) {
8087 /* x must be a string too, of equal length */
8088 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8089 if (!PyUnicode_Check(x)) {
8090 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8091 "be a string if there is a second argument");
8092 goto err;
8093 }
8094 if (PyUnicode_GET_SIZE(x) != ylen) {
8095 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8096 "arguments must have equal length");
8097 goto err;
8098 }
8099 /* create entries for translating chars in x to those in y */
8100 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008101 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8102 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008103 if (!key || !value)
8104 goto err;
8105 res = PyDict_SetItem(new, key, value);
8106 Py_DECREF(key);
8107 Py_DECREF(value);
8108 if (res < 0)
8109 goto err;
8110 }
8111 /* create entries for deleting chars in z */
8112 if (z != NULL) {
8113 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008114 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008115 if (!key)
8116 goto err;
8117 res = PyDict_SetItem(new, key, Py_None);
8118 Py_DECREF(key);
8119 if (res < 0)
8120 goto err;
8121 }
8122 }
8123 } else {
8124 /* x must be a dict */
8125 if (!PyDict_Check(x)) {
8126 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8127 "to maketrans it must be a dict");
8128 goto err;
8129 }
8130 /* copy entries into the new dict, converting string keys to int keys */
8131 while (PyDict_Next(x, &i, &key, &value)) {
8132 if (PyUnicode_Check(key)) {
8133 /* convert string keys to integer keys */
8134 PyObject *newkey;
8135 if (PyUnicode_GET_SIZE(key) != 1) {
8136 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8137 "table must be of length 1");
8138 goto err;
8139 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008140 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008141 if (!newkey)
8142 goto err;
8143 res = PyDict_SetItem(new, newkey, value);
8144 Py_DECREF(newkey);
8145 if (res < 0)
8146 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008147 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008148 /* just keep integer keys */
8149 if (PyDict_SetItem(new, key, value) < 0)
8150 goto err;
8151 } else {
8152 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8153 "be strings or integers");
8154 goto err;
8155 }
8156 }
8157 }
8158 return new;
8159 err:
8160 Py_DECREF(new);
8161 return NULL;
8162}
8163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008164PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008165"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166\n\
8167Return a copy of the string S, where all characters have been mapped\n\
8168through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008169Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008170Unmapped characters are left untouched. Characters mapped to None\n\
8171are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
8173static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008174unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175{
Georg Brandlceee0772007-11-27 23:48:05 +00008176 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177}
8178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008179PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008180"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008182Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183
8184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008185unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 return fixup(self, fixupper);
8188}
8189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008190PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008191"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008193Pad a numeric string S with zeros on the left, to fill a field\n\
8194of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
8196static PyObject *
8197unicode_zfill(PyUnicodeObject *self, PyObject *args)
8198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008199 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 PyUnicodeObject *u;
8201
Martin v. Löwis18e16552006-02-15 17:27:45 +00008202 Py_ssize_t width;
8203 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 return NULL;
8205
8206 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008207 if (PyUnicode_CheckExact(self)) {
8208 Py_INCREF(self);
8209 return (PyObject*) self;
8210 }
8211 else
8212 return PyUnicode_FromUnicode(
8213 PyUnicode_AS_UNICODE(self),
8214 PyUnicode_GET_SIZE(self)
8215 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 }
8217
8218 fill = width - self->length;
8219
8220 u = pad(self, fill, 0, '0');
8221
Walter Dörwald068325e2002-04-15 13:36:47 +00008222 if (u == NULL)
8223 return NULL;
8224
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 if (u->str[fill] == '+' || u->str[fill] == '-') {
8226 /* move sign to beginning of string */
8227 u->str[0] = u->str[fill];
8228 u->str[fill] = '0';
8229 }
8230
8231 return (PyObject*) u;
8232}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
#if 0
/* Compiled out: would return the value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(
        numfree);
}
#endif
8241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008242PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008243"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008245Return True if S starts with the specified prefix, False otherwise.\n\
8246With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008247With optional end, stop comparing S at that position.\n\
8248prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
8250static PyObject *
8251unicode_startswith(PyUnicodeObject *self,
8252 PyObject *args)
8253{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008254 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008256 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008257 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008258 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008260 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008261 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008263 if (PyTuple_Check(subobj)) {
8264 Py_ssize_t i;
8265 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8266 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8267 PyTuple_GET_ITEM(subobj, i));
8268 if (substring == NULL)
8269 return NULL;
8270 result = tailmatch(self, substring, start, end, -1);
8271 Py_DECREF(substring);
8272 if (result) {
8273 Py_RETURN_TRUE;
8274 }
8275 }
8276 /* nothing matched */
8277 Py_RETURN_FALSE;
8278 }
8279 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008281 return NULL;
8282 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008284 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285}
8286
8287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008288PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008289"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008291Return True if S ends with the specified suffix, False otherwise.\n\
8292With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008293With optional end, stop comparing S at that position.\n\
8294suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
8296static PyObject *
8297unicode_endswith(PyUnicodeObject *self,
8298 PyObject *args)
8299{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008300 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008303 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008304 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008306 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8307 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008309 if (PyTuple_Check(subobj)) {
8310 Py_ssize_t i;
8311 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8312 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8313 PyTuple_GET_ITEM(subobj, i));
8314 if (substring == NULL)
8315 return NULL;
8316 result = tailmatch(self, substring, start, end, +1);
8317 Py_DECREF(substring);
8318 if (result) {
8319 Py_RETURN_TRUE;
8320 }
8321 }
8322 Py_RETURN_FALSE;
8323 }
8324 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008328 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008330 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331}
8332
Eric Smith8c663262007-08-25 02:26:07 +00008333#include "stringlib/string_format.h"
8334
8335PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008336"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008337\n\
8338");
8339
Eric Smith4a7d76d2008-05-30 18:10:19 +00008340static PyObject *
8341unicode__format__(PyObject* self, PyObject* args)
8342{
8343 PyObject *format_spec;
8344
8345 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8346 return NULL;
8347
8348 return _PyUnicode_FormatAdvanced(self,
8349 PyUnicode_AS_UNICODE(format_spec),
8350 PyUnicode_GET_SIZE(format_spec));
8351}
8352
Eric Smith8c663262007-08-25 02:26:07 +00008353PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008354"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008355\n\
8356");
8357
8358static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008359unicode__sizeof__(PyUnicodeObject *v)
8360{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008361 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8362 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008363}
8364
8365PyDoc_STRVAR(sizeof__doc__,
8366"S.__sizeof__() -> size of S in memory, in bytes");
8367
8368static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008369unicode_getnewargs(PyUnicodeObject *v)
8370{
8371 return Py_BuildValue("(u#)", v->str, v->length);
8372}
8373
8374
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375static PyMethodDef unicode_methods[] = {
8376
8377 /* Order is according to common usage: often used methods should
8378 appear first, since lookup is done sequentially. */
8379
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008380 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8381 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8382 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008383 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008384 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8385 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8386 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8387 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8388 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8389 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8390 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008391 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008392 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8393 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8394 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008395 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008396 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8397 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8398 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008399 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008400 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008401 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008402 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008403 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8404 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8405 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8406 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8407 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8408 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8409 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8410 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8411 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8412 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8413 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8414 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8415 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8416 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008417 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008418 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008419 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008420 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008421 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008422 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8423 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008424 {"maketrans", (PyCFunction) unicode_maketrans,
8425 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008426 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008427#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008428 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429#endif
8430
8431#if 0
8432 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008433 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434#endif
8435
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008436 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 {NULL, NULL}
8438};
8439
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008440static PyObject *
8441unicode_mod(PyObject *v, PyObject *w)
8442{
8443 if (!PyUnicode_Check(v)) {
8444 Py_INCREF(Py_NotImplemented);
8445 return Py_NotImplemented;
8446 }
8447 return PyUnicode_Format(v, w);
8448}
8449
8450static PyNumberMethods unicode_as_number = {
8451 0, /*nb_add*/
8452 0, /*nb_subtract*/
8453 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008454 unicode_mod, /*nb_remainder*/
8455};
8456
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008458 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008459 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008460 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8461 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008462 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 0, /* sq_ass_item */
8464 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008465 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466};
8467
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008468static PyObject*
8469unicode_subscript(PyUnicodeObject* self, PyObject* item)
8470{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008471 if (PyIndex_Check(item)) {
8472 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008473 if (i == -1 && PyErr_Occurred())
8474 return NULL;
8475 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008476 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008477 return unicode_getitem(self, i);
8478 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008479 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008480 Py_UNICODE* source_buf;
8481 Py_UNICODE* result_buf;
8482 PyObject* result;
8483
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008484 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008485 &start, &stop, &step, &slicelength) < 0) {
8486 return NULL;
8487 }
8488
8489 if (slicelength <= 0) {
8490 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008491 } else if (start == 0 && step == 1 && slicelength == self->length &&
8492 PyUnicode_CheckExact(self)) {
8493 Py_INCREF(self);
8494 return (PyObject *)self;
8495 } else if (step == 1) {
8496 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008497 } else {
8498 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008499 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8500 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008501
8502 if (result_buf == NULL)
8503 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008504
8505 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8506 result_buf[i] = source_buf[cur];
8507 }
Tim Petersced69f82003-09-16 20:30:58 +00008508
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008509 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008510 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008511 return result;
8512 }
8513 } else {
8514 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8515 return NULL;
8516 }
8517}
8518
8519static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008520 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008521 (binaryfunc)unicode_subscript, /* mp_subscript */
8522 (objobjargproc)0, /* mp_ass_subscript */
8523};
8524
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526/* Helpers for PyUnicode_Format() */
8527
8528static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008529getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008531 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 if (argidx < arglen) {
8533 (*p_argidx)++;
8534 if (arglen < 0)
8535 return args;
8536 else
8537 return PyTuple_GetItem(args, argidx);
8538 }
8539 PyErr_SetString(PyExc_TypeError,
8540 "not enough arguments for format string");
8541 return NULL;
8542}
8543
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008545strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008547 register Py_ssize_t i;
8548 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 for (i = len - 1; i >= 0; i--)
8550 buffer[i] = (Py_UNICODE) charbuffer[i];
8551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 return len;
8553}
8554
Neal Norwitzfc76d632006-01-10 06:03:13 +00008555static int
8556doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8557{
Tim Peters15231542006-02-16 01:08:01 +00008558 Py_ssize_t result;
8559
Neal Norwitzfc76d632006-01-10 06:03:13 +00008560 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008561 result = strtounicode(buffer, (char *)buffer);
8562 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008563}
8564
#if 0
/* (Compiled out.)  Format x with PyOS_snprintf() using `format`, writing the
   char output into the start of buffer, then widen it in place to
   Py_UNICODE with strtounicode().  Returns the number of characters
   produced, as an int. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(bytes, len, format, x);
    nchars = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008576
Guido van Rossum078151d2002-08-11 04:24:12 +00008577/* XXX To save some code duplication, formatfloat/long/int could have been
8578 shared with stringobject.c, converting from 8-bit to Unicode after the
8579 formatting is done. */
8580
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581static int
8582formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008583 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 int flags,
8585 int prec,
8586 int type,
8587 PyObject *v)
8588{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008589 /* fmt = '%#.' + `prec` + `type`
8590 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 char fmt[20];
8592 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008593
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 x = PyFloat_AsDouble(v);
8595 if (x == -1.0 && PyErr_Occurred())
8596 return -1;
8597 if (prec < 0)
8598 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008599 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8600 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008601 /* Worst case length calc to ensure no buffer overrun:
8602
8603 'g' formats:
8604 fmt = %#.<prec>g
8605 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8606 for any double rep.)
8607 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8608
8609 'f' formats:
8610 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8611 len = 1 + 50 + 1 + prec = 52 + prec
8612
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008613 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008614 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008615
8616 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008617 if (((type == 'g' || type == 'G') &&
8618 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008619 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008620 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008621 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008622 return -1;
8623 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008624 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8625 (flags&F_ALT) ? "#" : "",
8626 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008627 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Tim Peters38fd5b62000-09-21 05:43:11 +00008630static PyObject*
8631formatlong(PyObject *val, int flags, int prec, int type)
8632{
8633 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008634 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008635 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008636 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008637
Christian Heimes72b710a2008-05-26 13:28:38 +00008638 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008639 if (!str)
8640 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008641 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008642 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008643 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008644}
8645
#if 0
/* (Compiled out.)  Format the C long value of v into buf using a
   printf-style integer conversion.  Returns the number of characters
   written, or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                       + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *minus;  /* "-" when we emit the sign ourselves, else "" */
    int is_radix;       /* conversion is one of 'x', 'X', 'o' */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';

    is_radix = (type == 'x' || type == 'X' || type == 'o');
    minus = (x < 0 && is_radix) ? "-" : "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_radix) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      minus, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      minus, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    /* The sign was already placed in fmt, so format the magnitude. */
    return longtounicode(buf, buflen, fmt, minus[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723
8724static int
8725formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008726 size_t buflen,
8727 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008729 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008730 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008731 if (PyUnicode_GET_SIZE(v) == 1) {
8732 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8733 buf[1] = '\0';
8734 return 1;
8735 }
8736#ifndef Py_UNICODE_WIDE
8737 if (PyUnicode_GET_SIZE(v) == 2) {
8738 /* Decode a valid surrogate pair */
8739 int c0 = PyUnicode_AS_UNICODE(v)[0];
8740 int c1 = PyUnicode_AS_UNICODE(v)[1];
8741 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8742 0xDC00 <= c1 && c1 <= 0xDFFF) {
8743 buf[0] = c0;
8744 buf[1] = c1;
8745 buf[2] = '\0';
8746 return 2;
8747 }
8748 }
8749#endif
8750 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 else {
8753 /* Integer input truncated to a character */
8754 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008755 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008757 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008758
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008759 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008760 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008761 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008762 return -1;
8763 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008764
8765#ifndef Py_UNICODE_WIDE
8766 if (x > 0xffff) {
8767 x -= 0x10000;
8768 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8769 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8770 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008771 }
8772#endif
8773 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008774 buf[1] = '\0';
8775 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008777
8778 onError:
8779 PyErr_SetString(PyExc_TypeError,
8780 "%c requires int or char");
8781 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008784/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8785
8786 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8787 chars are formatted. XXX This is a magic number. Each formatting
8788 routine does bounds checking to ensure no overflow, but a better
8789 solution may be to malloc a buffer of appropriate size for each
8790 format. For now, the current solution is sufficient.
8791*/
/* Fully parenthesized so the cast cannot combine with operators at the
   point of use (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
8793
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794PyObject *PyUnicode_Format(PyObject *format,
8795 PyObject *args)
8796{
8797 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008798 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 int args_owned = 0;
8800 PyUnicodeObject *result = NULL;
8801 PyObject *dict = NULL;
8802 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008803
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 if (format == NULL || args == NULL) {
8805 PyErr_BadInternalCall();
8806 return NULL;
8807 }
8808 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008809 if (uformat == NULL)
8810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811 fmt = PyUnicode_AS_UNICODE(uformat);
8812 fmtcnt = PyUnicode_GET_SIZE(uformat);
8813
8814 reslen = rescnt = fmtcnt + 100;
8815 result = _PyUnicode_New(reslen);
8816 if (result == NULL)
8817 goto onError;
8818 res = PyUnicode_AS_UNICODE(result);
8819
8820 if (PyTuple_Check(args)) {
8821 arglen = PyTuple_Size(args);
8822 argidx = 0;
8823 }
8824 else {
8825 arglen = -1;
8826 argidx = -2;
8827 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008828 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008829 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 dict = args;
8831
8832 while (--fmtcnt >= 0) {
8833 if (*fmt != '%') {
8834 if (--rescnt < 0) {
8835 rescnt = fmtcnt + 100;
8836 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008837 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8840 --rescnt;
8841 }
8842 *res++ = *fmt++;
8843 }
8844 else {
8845 /* Got a format specifier */
8846 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008847 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 Py_UNICODE c = '\0';
8850 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008851 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 PyObject *v = NULL;
8853 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008854 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008856 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008857 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858
8859 fmt++;
8860 if (*fmt == '(') {
8861 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008862 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 PyObject *key;
8864 int pcount = 1;
8865
8866 if (dict == NULL) {
8867 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008868 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 goto onError;
8870 }
8871 ++fmt;
8872 --fmtcnt;
8873 keystart = fmt;
8874 /* Skip over balanced parentheses */
8875 while (pcount > 0 && --fmtcnt >= 0) {
8876 if (*fmt == ')')
8877 --pcount;
8878 else if (*fmt == '(')
8879 ++pcount;
8880 fmt++;
8881 }
8882 keylen = fmt - keystart - 1;
8883 if (fmtcnt < 0 || pcount > 0) {
8884 PyErr_SetString(PyExc_ValueError,
8885 "incomplete format key");
8886 goto onError;
8887 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008888#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008889 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 then looked up since Python uses strings to hold
8891 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008892 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 key = PyUnicode_EncodeUTF8(keystart,
8894 keylen,
8895 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008896#else
8897 key = PyUnicode_FromUnicode(keystart, keylen);
8898#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 if (key == NULL)
8900 goto onError;
8901 if (args_owned) {
8902 Py_DECREF(args);
8903 args_owned = 0;
8904 }
8905 args = PyObject_GetItem(dict, key);
8906 Py_DECREF(key);
8907 if (args == NULL) {
8908 goto onError;
8909 }
8910 args_owned = 1;
8911 arglen = -1;
8912 argidx = -2;
8913 }
8914 while (--fmtcnt >= 0) {
8915 switch (c = *fmt++) {
8916 case '-': flags |= F_LJUST; continue;
8917 case '+': flags |= F_SIGN; continue;
8918 case ' ': flags |= F_BLANK; continue;
8919 case '#': flags |= F_ALT; continue;
8920 case '0': flags |= F_ZERO; continue;
8921 }
8922 break;
8923 }
8924 if (c == '*') {
8925 v = getnextarg(args, arglen, &argidx);
8926 if (v == NULL)
8927 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008928 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 PyErr_SetString(PyExc_TypeError,
8930 "* wants int");
8931 goto onError;
8932 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008933 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008934 if (width == -1 && PyErr_Occurred())
8935 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 if (width < 0) {
8937 flags |= F_LJUST;
8938 width = -width;
8939 }
8940 if (--fmtcnt >= 0)
8941 c = *fmt++;
8942 }
8943 else if (c >= '0' && c <= '9') {
8944 width = c - '0';
8945 while (--fmtcnt >= 0) {
8946 c = *fmt++;
8947 if (c < '0' || c > '9')
8948 break;
8949 if ((width*10) / 10 != width) {
8950 PyErr_SetString(PyExc_ValueError,
8951 "width too big");
8952 goto onError;
8953 }
8954 width = width*10 + (c - '0');
8955 }
8956 }
8957 if (c == '.') {
8958 prec = 0;
8959 if (--fmtcnt >= 0)
8960 c = *fmt++;
8961 if (c == '*') {
8962 v = getnextarg(args, arglen, &argidx);
8963 if (v == NULL)
8964 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008965 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 PyErr_SetString(PyExc_TypeError,
8967 "* wants int");
8968 goto onError;
8969 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008970 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008971 if (prec == -1 && PyErr_Occurred())
8972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 if (prec < 0)
8974 prec = 0;
8975 if (--fmtcnt >= 0)
8976 c = *fmt++;
8977 }
8978 else if (c >= '0' && c <= '9') {
8979 prec = c - '0';
8980 while (--fmtcnt >= 0) {
8981 c = Py_CHARMASK(*fmt++);
8982 if (c < '0' || c > '9')
8983 break;
8984 if ((prec*10) / 10 != prec) {
8985 PyErr_SetString(PyExc_ValueError,
8986 "prec too big");
8987 goto onError;
8988 }
8989 prec = prec*10 + (c - '0');
8990 }
8991 }
8992 } /* prec */
8993 if (fmtcnt >= 0) {
8994 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 if (--fmtcnt >= 0)
8996 c = *fmt++;
8997 }
8998 }
8999 if (fmtcnt < 0) {
9000 PyErr_SetString(PyExc_ValueError,
9001 "incomplete format");
9002 goto onError;
9003 }
9004 if (c != '%') {
9005 v = getnextarg(args, arglen, &argidx);
9006 if (v == NULL)
9007 goto onError;
9008 }
9009 sign = 0;
9010 fill = ' ';
9011 switch (c) {
9012
9013 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009014 pbuf = formatbuf;
9015 /* presume that buffer length is at least 1 */
9016 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 len = 1;
9018 break;
9019
9020 case 's':
9021 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009022 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 if (PyUnicode_Check(v) && c == 's') {
9024 temp = v;
9025 Py_INCREF(temp);
9026 }
9027 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009029 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009030 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009032 else
9033 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 if (temp == NULL)
9035 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009036 if (PyUnicode_Check(temp))
9037 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009038 else {
9039 Py_DECREF(temp);
9040 PyErr_SetString(PyExc_TypeError,
9041 "%s argument has non-string str()");
9042 goto onError;
9043 }
9044 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009045 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 len = PyUnicode_GET_SIZE(temp);
9047 if (prec >= 0 && len > prec)
9048 len = prec;
9049 break;
9050
9051 case 'i':
9052 case 'd':
9053 case 'u':
9054 case 'o':
9055 case 'x':
9056 case 'X':
9057 if (c == 'i')
9058 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009059 isnumok = 0;
9060 if (PyNumber_Check(v)) {
9061 PyObject *iobj=NULL;
9062
9063 if (PyLong_Check(v)) {
9064 iobj = v;
9065 Py_INCREF(iobj);
9066 }
9067 else {
9068 iobj = PyNumber_Long(v);
9069 }
9070 if (iobj!=NULL) {
9071 if (PyLong_Check(iobj)) {
9072 isnumok = 1;
9073 temp = formatlong(iobj, flags, prec, c);
9074 Py_DECREF(iobj);
9075 if (!temp)
9076 goto onError;
9077 pbuf = PyUnicode_AS_UNICODE(temp);
9078 len = PyUnicode_GET_SIZE(temp);
9079 sign = 1;
9080 }
9081 else {
9082 Py_DECREF(iobj);
9083 }
9084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009086 if (!isnumok) {
9087 PyErr_Format(PyExc_TypeError,
9088 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009089 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009090 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009091 }
9092 if (flags & F_ZERO)
9093 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 break;
9095
9096 case 'e':
9097 case 'E':
9098 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009099 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 case 'g':
9101 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009102 if (c == 'F')
9103 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009104 pbuf = formatbuf;
9105 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9106 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 if (len < 0)
9108 goto onError;
9109 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009110 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 fill = '0';
9112 break;
9113
9114 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009115 pbuf = formatbuf;
9116 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 if (len < 0)
9118 goto onError;
9119 break;
9120
9121 default:
9122 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009123 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009124 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009125 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009126 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009127 (Py_ssize_t)(fmt - 1 -
9128 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 goto onError;
9130 }
9131 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009132 if (*pbuf == '-' || *pbuf == '+') {
9133 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 len--;
9135 }
9136 else if (flags & F_SIGN)
9137 sign = '+';
9138 else if (flags & F_BLANK)
9139 sign = ' ';
9140 else
9141 sign = 0;
9142 }
9143 if (width < len)
9144 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009145 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 reslen -= rescnt;
9147 rescnt = width + fmtcnt + 100;
9148 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009149 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009150 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009151 PyErr_NoMemory();
9152 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009153 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009154 if (_PyUnicode_Resize(&result, reslen) < 0) {
9155 Py_XDECREF(temp);
9156 goto onError;
9157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 res = PyUnicode_AS_UNICODE(result)
9159 + reslen - rescnt;
9160 }
9161 if (sign) {
9162 if (fill != ' ')
9163 *res++ = sign;
9164 rescnt--;
9165 if (width > len)
9166 width--;
9167 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009168 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009169 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009170 assert(pbuf[1] == c);
9171 if (fill != ' ') {
9172 *res++ = *pbuf++;
9173 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009174 }
Tim Petersfff53252001-04-12 18:38:48 +00009175 rescnt -= 2;
9176 width -= 2;
9177 if (width < 0)
9178 width = 0;
9179 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 if (width > len && !(flags & F_LJUST)) {
9182 do {
9183 --rescnt;
9184 *res++ = fill;
9185 } while (--width > len);
9186 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009187 if (fill == ' ') {
9188 if (sign)
9189 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009190 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009191 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009192 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009193 *res++ = *pbuf++;
9194 *res++ = *pbuf++;
9195 }
9196 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009197 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 res += len;
9199 rescnt -= len;
9200 while (--width >= len) {
9201 --rescnt;
9202 *res++ = ' ';
9203 }
9204 if (dict && (argidx < arglen) && c != '%') {
9205 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009206 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009207 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 goto onError;
9209 }
9210 Py_XDECREF(temp);
9211 } /* '%' */
9212 } /* until end */
9213 if (argidx < arglen && !dict) {
9214 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009215 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 goto onError;
9217 }
9218
Thomas Woutersa96affe2006-03-12 00:29:36 +00009219 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9220 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 if (args_owned) {
9222 Py_DECREF(args);
9223 }
9224 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 return (PyObject *)result;
9226
9227 onError:
9228 Py_XDECREF(result);
9229 Py_DECREF(uformat);
9230 if (args_owned) {
9231 Py_DECREF(args);
9232 }
9233 return NULL;
9234}
9235
Jeremy Hylton938ace62002-07-17 16:30:39 +00009236static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009237unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9238
Tim Peters6d6c1a32001-08-02 04:15:00 +00009239static PyObject *
9240unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9241{
9242 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009243 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009244 char *encoding = NULL;
9245 char *errors = NULL;
9246
Guido van Rossume023fe02001-08-30 03:12:59 +00009247 if (type != &PyUnicode_Type)
9248 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009249 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009250 kwlist, &x, &encoding, &errors))
9251 return NULL;
9252 if (x == NULL)
9253 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009254 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009255 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009256 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009257 return PyUnicode_FromEncodedObject(x, encoding, errors);
9258}
9259
Guido van Rossume023fe02001-08-30 03:12:59 +00009260static PyObject *
9261unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9262{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009263 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009264 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009265
9266 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9267 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9268 if (tmp == NULL)
9269 return NULL;
9270 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009271 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009272 if (pnew == NULL) {
9273 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009274 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009275 }
Christian Heimesb186d002008-03-18 15:15:01 +00009276 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009277 if (pnew->str == NULL) {
9278 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009279 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009280 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009281 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009282 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009283 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9284 pnew->length = n;
9285 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009286 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009287 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009288}
9289
/* Docstring installed as the tp_doc slot of PyUnicode_Type below. */
PyDoc_STRVAR(unicode_doc,
"str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");

/* Defined with the iterator type further down; needed here for tp_iter. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Instances can be subclassed (Py_TPFLAGS_BASETYPE)
   and are created through unicode_new(); slots left at 0 are not set
   here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_compare */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9342
9343/* Initialize the Unicode implementation */
9344
Thomas Wouters78890102000-07-22 19:25:51 +00009345void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009347 int i;
9348
Thomas Wouters477c8d52006-05-27 19:21:47 +00009349 /* XXX - move this array to unicodectype.c ? */
9350 Py_UNICODE linebreak[] = {
9351 0x000A, /* LINE FEED */
9352 0x000D, /* CARRIAGE RETURN */
9353 0x001C, /* FILE SEPARATOR */
9354 0x001D, /* GROUP SEPARATOR */
9355 0x001E, /* RECORD SEPARATOR */
9356 0x0085, /* NEXT LINE */
9357 0x2028, /* LINE SEPARATOR */
9358 0x2029, /* PARAGRAPH SEPARATOR */
9359 };
9360
Fred Drakee4315f52000-05-09 19:53:39 +00009361 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009362 free_list = NULL;
9363 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009365 if (!unicode_empty)
9366 return;
9367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009368 for (i = 0; i < 256; i++)
9369 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009370 if (PyType_Ready(&PyUnicode_Type) < 0)
9371 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009372
9373 /* initialize the linebreak bloom filter */
9374 bloom_linebreak = make_bloom_mask(
9375 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9376 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009377
9378 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379}
9380
9381/* Finalize the Unicode implementation */
9382
Christian Heimesa156e092008-02-16 07:38:31 +00009383int
9384PyUnicode_ClearFreeList(void)
9385{
9386 int freelist_size = numfree;
9387 PyUnicodeObject *u;
9388
9389 for (u = free_list; u != NULL;) {
9390 PyUnicodeObject *v = u;
9391 u = *(PyUnicodeObject **)u;
9392 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009393 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009394 Py_XDECREF(v->defenc);
9395 PyObject_Del(v);
9396 numfree--;
9397 }
9398 free_list = NULL;
9399 assert(numfree == 0);
9400 return freelist_size;
9401}
9402
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403void
Thomas Wouters78890102000-07-22 19:25:51 +00009404_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009406 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009408 Py_XDECREF(unicode_empty);
9409 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009410
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009411 for (i = 0; i < 256; i++) {
9412 if (unicode_latin1[i]) {
9413 Py_DECREF(unicode_latin1[i]);
9414 unicode_latin1[i] = NULL;
9415 }
9416 }
Christian Heimesa156e092008-02-16 07:38:31 +00009417 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009419
/* Intern the unicode string referenced by *p.

   p must point to a non-NULL unicode object; anything else is a fatal
   error.  Instances of subclasses and strings that are already interned
   are left untouched.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by a new reference to that string and the reference previously
   held in *p is released.  Otherwise the string is stored in the dict as
   both key and value and marked SSTATE_INTERNED_MORTAL.

   The function never leaves an exception set: if the dict cannot be
   created or the insertion fails, the error is cleared and *p stays
   un-interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one out.  Take
           the new reference before dropping the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is raised only around the insertion and is
       reset on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9467
9468void
9469PyUnicode_InternImmortal(PyObject **p)
9470{
9471 PyUnicode_InternInPlace(p);
9472 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9473 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9474 Py_INCREF(*p);
9475 }
9476}
9477
9478PyObject *
9479PyUnicode_InternFromString(const char *cp)
9480{
9481 PyObject *s = PyUnicode_FromString(cp);
9482 if (s == NULL)
9483 return NULL;
9484 PyUnicode_InternInPlace(&s);
9485 return s;
9486}
9487
/* Undo the refcount adjustments made for interned strings and discard the
   `interned` dict.

   Does nothing when the dict does not exist.  Every key of the dict gets
   back the references that were withheld when it was interned (one for an
   immortal string, two for a mortal one) and is reset to
   SSTATE_NOT_INTERNED.  The number of strings and the summed lengths of
   the mortal and immortal ones are reported on stderr. */
void _Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyUnicodeObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
        switch (s->state) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += s->length;
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Mirrors the "-= 2" applied when the string was interned. */
            Py_REFCNT(s) += 2;
            mortal_size += s->length;
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        s->state = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_DECREF(interned);
    interned = NULL;
}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009538
9539
9540/********************* Unicode Iterator **************************/
9541
/* State of an iterator over the characters of a unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9547
/* Deallocator for the unicode iterator: removes the iterator from GC
   tracking, releases its reference to the iterated string (which may
   already be NULL once the iterator is exhausted) and frees the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9555
/* GC traversal for the unicode iterator: the iterated string is the only
   object reference it holds.  Returns 0 unless Py_VISIT returns early
   with the visitor's result. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9562
9563static PyObject *
9564unicodeiter_next(unicodeiterobject *it)
9565{
9566 PyUnicodeObject *seq;
9567 PyObject *item;
9568
9569 assert(it != NULL);
9570 seq = it->it_seq;
9571 if (seq == NULL)
9572 return NULL;
9573 assert(PyUnicode_Check(seq));
9574
9575 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009576 item = PyUnicode_FromUnicode(
9577 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009578 if (item != NULL)
9579 ++it->it_index;
9580 return item;
9581 }
9582
9583 Py_DECREF(seq);
9584 it->it_seq = NULL;
9585 return NULL;
9586}
9587
9588static PyObject *
9589unicodeiter_len(unicodeiterobject *it)
9590{
9591 Py_ssize_t len = 0;
9592 if (it->it_seq)
9593 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009594 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009595}
9596
/* Docstring for the __length_hint__ method registered below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9604
/* Type object of the iterator returned by unicode_iter().  It takes part
   in cyclic garbage collection (Py_TPFLAGS_HAVE_GC) and is its own
   iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9637
9638static PyObject *
9639unicode_iter(PyObject *seq)
9640{
9641 unicodeiterobject *it;
9642
9643 if (!PyUnicode_Check(seq)) {
9644 PyErr_BadInternalCall();
9645 return NULL;
9646 }
9647 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9648 if (it == NULL)
9649 return NULL;
9650 it->it_index = 0;
9651 Py_INCREF(seq);
9652 it->it_seq = (PyUnicodeObject *)seq;
9653 _PyObject_GC_TRACK(it);
9654 return (PyObject *)it;
9655}
9656
/* Return the number of Py_UNICODE elements in the zero-terminated buffer
   u, not counting the terminator.

   The count is kept in a size_t, the declared return type, so that it
   cannot overflow a narrower signed counter on very long buffers. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
9665
/* Copy the zero-terminated buffer s2, terminator included, into s1.
   The caller must provide enough room in s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
9673
/* Copy at most n Py_UNICODE elements from the zero-terminated buffer s2
   into s1.  Copying stops after the terminator has been copied or after n
   elements have been written, whichever comes first.

   As with C strncpy(), s1 is not terminated when s2 holds n or more
   elements before its terminator.  Unlike strncpy(), the rest of s1 is
   not zero-filled.  Returns s1.

   The bound is tested before each store, so no more than n elements are
   ever written and nothing is written when n is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    while (n-- > 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
9683
/* Compare two zero-terminated Py_UNICODE buffers element by element.

   Returns -1, 0 or +1.  At the first position where the buffers differ
   the element values decide the order; a buffer that ends first sorts
   before the longer one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;;) {
        const Py_UNICODE left = *s1++;
        const Py_UNICODE right = *s2++;

        if (left == 0 || right == 0) {
            /* At least one buffer has ended. */
            if (left != 0)
                return 1;
            if (right != 0)
                return -1;
            return 0;
        }
        if (left != right)
            return (left < right) ? -1 : +1;
    }
}
9697
/* Find the first occurrence of c in the zero-terminated buffer s.

   Returns a pointer to the matching element, or NULL when c does not
   occur before the terminator.  The terminator itself is never matched,
   so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        ++s;
    }
    return NULL;
}
9707
9708
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009709#ifdef __cplusplus
9710}
9711#endif
9712
9713
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009714/*
9715Local variables:
9716c-basic-offset: 4
9717indent-tabs-mode: nil
9718End:
9719*/