blob: 847b61df918726077481989db5674ee98c3193fb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per code point in the range 0x00..0x7F:
   an entry is 1 when the character counts as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks.

   Lookup table with one entry per code point in the range 0x00..0x7F:
   an entry is 1 when the character is treated as a line break, 0
   otherwise.  The table is only ever read (see BLOOM_LINEBREAK), so it
   is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that bit 31 can be formed
   without overflowing a signed int, and mask is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch is a line break: ASCII characters are answered from the
   ascii_linebreak table, everything else is pre-filtered through the
   bloom mask before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True when chr is a member of set: the bloom mask is consulted first
   and unicode_member() is only called when the mask says "maybe".
   The whole expansion is parenthesized so the macro can be combined with
   other operators (e.g. negated) without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Martin v. Löwis18e16552006-02-15 17:27:45 +0000421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422{
423 register PyUnicodeObject *v;
424
425 /* Argument checks */
426 if (unicode == NULL) {
427 PyErr_BadInternalCall();
428 return -1;
429 }
430 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000431 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432 PyErr_BadInternalCall();
433 return -1;
434 }
435
436 /* Resizing unicode_empty and single character objects is not
437 possible since these are being shared. We simply return a fresh
438 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000439 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 (v == unicode_empty || v->length == 1)) {
441 PyUnicodeObject *w = _PyUnicode_New(length);
442 if (w == NULL)
443 return -1;
444 Py_UNICODE_COPY(w->str, v->str,
445 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000446 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 *unicode = (PyObject *)w;
448 return 0;
449 }
450
451 /* Note that we don't have to modify *unicode for unshared Unicode
452 objects, since we can modify them in-place. */
453 return unicode_resize(v, length);
454}
455
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject *
   variable to the PyObject ** expected by PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
459
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000461 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462{
463 PyUnicodeObject *unicode;
464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000465 /* If the Unicode data is known at construction time, we can apply
466 some optimizations which share commonly used objects. */
467 if (u != NULL) {
468
469 /* Optimization for empty strings */
470 if (size == 0 && unicode_empty != NULL) {
471 Py_INCREF(unicode_empty);
472 return (PyObject *)unicode_empty;
473 }
474
475 /* Single character Unicode objects in the Latin-1 range are
476 shared when using this constructor */
477 if (size == 1 && *u < 256) {
478 unicode = unicode_latin1[*u];
479 if (!unicode) {
480 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 if (!unicode)
482 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000483 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484 unicode_latin1[*u] = unicode;
485 }
486 Py_INCREF(unicode);
487 return (PyObject *)unicode;
488 }
489 }
Tim Petersced69f82003-09-16 20:30:58 +0000490
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the Unicode data into the new object */
496 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000497 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498
499 return (PyObject *)unicode;
500}
501
Walter Dörwaldd2034312007-05-18 16:29:38 +0000502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000503{
504 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000505
506 if (size < 0) {
507 PyErr_SetString(PyExc_SystemError,
508 "Negative size passed to PyUnicode_FromStringAndSize");
509 return NULL;
510 }
511
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000513 some optimizations which share commonly used objects.
514 Also, this means the input must be UTF-8, so fall back to the
515 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (u != NULL) {
517
518 /* Optimization for empty strings */
519 if (size == 0 && unicode_empty != NULL) {
520 Py_INCREF(unicode_empty);
521 return (PyObject *)unicode_empty;
522 }
523
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 /* Single characters are shared when using this constructor.
525 Restrict to ASCII, since the input must be UTF-8. */
526 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000527 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (!unicode) {
529 unicode = _PyUnicode_New(1);
530 if (!unicode)
531 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000532 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000533 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 }
535 Py_INCREF(unicode);
536 return (PyObject *)unicode;
537 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000538
539 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 }
541
Walter Dörwald55507312007-05-18 13:12:10 +0000542 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000543 if (!unicode)
544 return NULL;
545
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000546 return (PyObject *)unicode;
547}
548
Walter Dörwaldd2034312007-05-18 16:29:38 +0000549PyObject *PyUnicode_FromString(const char *u)
550{
551 size_t size = strlen(u);
552 if (size > PY_SSIZE_T_MAX) {
553 PyErr_SetString(PyExc_OverflowError, "input too long");
554 return NULL;
555 }
556
557 return PyUnicode_FromStringAndSize(u, size);
558}
559
Guido van Rossumd57fd912000-03-10 22:53:23 +0000560#ifdef HAVE_WCHAR_H
561
562PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000563 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564{
565 PyUnicodeObject *unicode;
566
567 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000568 if (size == 0)
569 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
Martin v. Löwis790465f2008-04-05 20:41:37 +0000574 if (size == -1) {
575 size = wcslen(w);
576 }
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578 unicode = _PyUnicode_New(size);
579 if (!unicode)
580 return NULL;
581
582 /* Copy the wchar_t data into the new object */
583#ifdef HAVE_USABLE_WCHAR_T
584 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000585#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 {
587 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000590 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 *u++ = *w++;
592 }
593#endif
594
595 return (PyObject *)unicode;
596}
597
Walter Dörwald346737f2007-05-31 10:44:43 +0000598static void
599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
600{
601 *fmt++ = '%';
602 if (width) {
603 if (zeropad)
604 *fmt++ = '0';
605 fmt += sprintf(fmt, "%d", width);
606 }
607 if (precision)
608 fmt += sprintf(fmt, ".%d", precision);
609 if (longflag)
610 *fmt++ = 'l';
611 else if (size_tflag) {
612 char *f = PY_FORMAT_SIZE_T;
613 while (*f)
614 *fmt++ = *f++;
615 }
616 *fmt++ = c;
617 *fmt = '\0';
618}
619
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s` past the copied characters.  Relies on the variables
   `copy` and `s` being declared in the enclosing scope. */
#define appendstring(string) \
    {for (copy = (string); *copy;) *s++ = *copy++;}
621
622PyObject *
623PyUnicode_FromFormatV(const char *format, va_list vargs)
624{
625 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000626 Py_ssize_t callcount = 0;
627 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000628 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000629 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000630 int width = 0;
631 int precision = 0;
632 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000633 const char* f;
634 Py_UNICODE *s;
635 PyObject *string;
636 /* used by sprintf */
637 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000638 /* use abuffer instead of buffer, if we need more space
639 * (which can happen if there's a format specifier with width). */
640 char *abuffer = NULL;
641 char *realbuffer;
642 Py_ssize_t abuffersize = 0;
643 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000644 const char *copy;
645
646#ifdef VA_LIST_IS_ARRAY
647 Py_MEMCPY(count, vargs, sizeof(va_list));
648#else
649#ifdef __va_copy
650 __va_copy(count, vargs);
651#else
652 count = vargs;
653#endif
654#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 /* step 1: count the number of %S/%R/%A format specifications
656 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
657 * these objects once during step 3 and put the result in
658 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000660 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 ++callcount;
662 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000663 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000664 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000665 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000666 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (!callresults) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 callresult = callresults;
672 }
673 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 for (f = format; *f; f++) {
675 if (*f == '%') {
676 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000677 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000678 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681 ;
682
683 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
684 * they don't affect the amount of space we reserve.
685 */
686 if ((*f == 'l' || *f == 'z') &&
687 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000688 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000689
690 switch (*f) {
691 case 'c':
692 (void)va_arg(count, int);
693 /* fall through... */
694 case '%':
695 n++;
696 break;
697 case 'd': case 'u': case 'i': case 'x':
698 (void) va_arg(count, int);
699 /* 20 bytes is enough to hold a 64-bit
700 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000701 This isn't enough for octal.
702 If a width is specified we need more
703 (which we allocate later). */
704 if (width < 20)
705 width = 20;
706 n += width;
707 if (abuffersize < width)
708 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 break;
710 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000711 {
712 /* UTF-8 */
713 unsigned char*s;
714 s = va_arg(count, unsigned char*);
715 while (*s) {
716 if (*s < 128) {
717 n++; s++;
718 } else if (*s < 0xc0) {
719 /* invalid UTF-8 */
720 n++; s++;
721 } else if (*s < 0xc0) {
722 n++;
723 s++; if(!*s)break;
724 s++;
725 } else if (*s < 0xe0) {
726 n++;
727 s++; if(!*s)break;
728 s++; if(!*s)break;
729 s++;
730 } else {
731 #ifdef Py_UNICODE_WIDE
732 n++;
733 #else
734 n+=2;
735 #endif
736 s++; if(!*s)break;
737 s++; if(!*s)break;
738 s++; if(!*s)break;
739 s++;
740 }
741 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 case 'U':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 assert(obj && PyUnicode_Check(obj));
748 n += PyUnicode_GET_SIZE(obj);
749 break;
750 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000751 case 'V':
752 {
753 PyObject *obj = va_arg(count, PyObject *);
754 const char *str = va_arg(count, const char *);
755 assert(obj || str);
756 assert(!obj || PyUnicode_Check(obj));
757 if (obj)
758 n += PyUnicode_GET_SIZE(obj);
759 else
760 n += strlen(str);
761 break;
762 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 case 'S':
764 {
765 PyObject *obj = va_arg(count, PyObject *);
766 PyObject *str;
767 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000768 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000769 if (!str)
770 goto fail;
771 n += PyUnicode_GET_SIZE(str);
772 /* Remember the str and switch to the next slot */
773 *callresult++ = str;
774 break;
775 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000776 case 'R':
777 {
778 PyObject *obj = va_arg(count, PyObject *);
779 PyObject *repr;
780 assert(obj);
781 repr = PyObject_Repr(obj);
782 if (!repr)
783 goto fail;
784 n += PyUnicode_GET_SIZE(repr);
785 /* Remember the repr and switch to the next slot */
786 *callresult++ = repr;
787 break;
788 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000789 case 'A':
790 {
791 PyObject *obj = va_arg(count, PyObject *);
792 PyObject *ascii;
793 assert(obj);
794 ascii = PyObject_ASCII(obj);
795 if (!ascii)
796 goto fail;
797 n += PyUnicode_GET_SIZE(ascii);
798 /* Remember the repr and switch to the next slot */
799 *callresult++ = ascii;
800 break;
801 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 case 'p':
803 (void) va_arg(count, int);
804 /* maximum 64-bit pointer representation:
805 * 0xffffffffffffffff
806 * so 19 characters is enough.
807 * XXX I count 18 -- what's the extra for?
808 */
809 n += 19;
810 break;
811 default:
812 /* if we stumble upon an unknown
813 formatting code, copy the rest of
814 the format string to the output
815 string. (we cannot just skip the
816 code, since there's no way to know
817 what's in the argument list) */
818 n += strlen(p);
819 goto expand;
820 }
821 } else
822 n++;
823 }
824 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000825 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000826 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (!abuffer) {
828 PyErr_NoMemory();
829 goto fail;
830 }
831 realbuffer = abuffer;
832 }
833 else
834 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000835 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 we don't have to resize the string.
838 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000839 string = PyUnicode_FromUnicode(NULL, n);
840 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842
843 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845
846 for (f = format; *f; f++) {
847 if (*f == '%') {
848 const char* p = f++;
849 int longflag = 0;
850 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 zeropad = (*f == '0');
852 /* parse the width.precision part */
853 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 width = (width*10) + *f++ - '0';
856 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 if (*f == '.') {
858 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000859 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 /* handle the long flag, but only for %ld and %lu.
863 others can be added when necessary. */
864 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
865 longflag = 1;
866 ++f;
867 }
868 /* handle the size_t flag. */
869 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
870 size_tflag = 1;
871 ++f;
872 }
873
874 switch (*f) {
875 case 'c':
876 *s++ = va_arg(vargs, int);
877 break;
878 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000879 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000880 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, int));
886 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
888 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000889 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000890 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000904 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
905 sprintf(realbuffer, fmt, va_arg(vargs, int));
906 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907 break;
908 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000909 {
910 /* Parameter must be UTF-8 encoded.
911 In case of encoding errors, use
912 the replacement character. */
913 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000914 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000915 u = PyUnicode_DecodeUTF8(p, strlen(p),
916 "replace");
917 if (!u)
918 goto fail;
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
920 PyUnicode_GET_SIZE(u));
921 s += PyUnicode_GET_SIZE(u);
922 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000923 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000924 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 case 'U':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000928 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
929 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
930 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000931 break;
932 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000933 case 'V':
934 {
935 PyObject *obj = va_arg(vargs, PyObject *);
936 const char *str = va_arg(vargs, const char *);
937 if (obj) {
938 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
939 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
940 s += size;
941 } else {
942 appendstring(str);
943 }
944 break;
945 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000946 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 case 'R':
948 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 Py_UNICODE *ucopy;
950 Py_ssize_t usize;
951 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 /* unused, since we already have the result */
953 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000954 ucopy = PyUnicode_AS_UNICODE(*callresult);
955 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 for (upos = 0; upos<usize;)
957 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000958 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000959 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 ++callresult;
962 break;
963 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000964 case 'p':
965 sprintf(buffer, "%p", va_arg(vargs, void*));
966 /* %p is ill-defined: ensure leading 0x. */
967 if (buffer[1] == 'X')
968 buffer[1] = 'x';
969 else if (buffer[1] != 'x') {
970 memmove(buffer+2, buffer, strlen(buffer)+1);
971 buffer[0] = '0';
972 buffer[1] = 'x';
973 }
974 appendstring(buffer);
975 break;
976 case '%':
977 *s++ = '%';
978 break;
979 default:
980 appendstring(p);
981 goto end;
982 }
983 } else
984 *s++ = *f;
985 }
986
987 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000988 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000989 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000990 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000992 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
993 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000994 fail:
995 if (callresults) {
996 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000997 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000998 Py_DECREF(*callresult2);
999 ++callresult2;
1000 }
Christian Heimesb186d002008-03-18 15:15:01 +00001001 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001002 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001003 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001004 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001005 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001006}
1007
1008#undef appendstring
1009
1010PyObject *
1011PyUnicode_FromFormat(const char *format, ...)
1012{
1013 PyObject* ret;
1014 va_list vargs;
1015
1016#ifdef HAVE_STDARG_PROTOTYPES
1017 va_start(vargs, format);
1018#else
1019 va_start(vargs);
1020#endif
1021 ret = PyUnicode_FromFormatV(format, vargs);
1022 va_end(vargs);
1023 return ret;
1024}
1025
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1027 wchar_t *w,
1028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029{
1030 if (unicode == NULL) {
1031 PyErr_BadInternalCall();
1032 return -1;
1033 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001034
1035 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037 size = PyUnicode_GET_SIZE(unicode) + 1;
1038
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039#ifdef HAVE_USABLE_WCHAR_T
1040 memcpy(w, unicode->str, size * sizeof(wchar_t));
1041#else
1042 {
1043 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001044 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001046 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 *w++ = *u++;
1048 }
1049#endif
1050
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001051 if (size > PyUnicode_GET_SIZE(unicode))
1052 return PyUnicode_GET_SIZE(unicode);
1053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 return size;
1055}
1056
1057#endif
1058
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059PyObject *PyUnicode_FromOrdinal(int ordinal)
1060{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001061 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001063 if (ordinal < 0 || ordinal > 0x10ffff) {
1064 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001065 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001066 return NULL;
1067 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001068
1069#ifndef Py_UNICODE_WIDE
1070 if (ordinal > 0xffff) {
1071 ordinal -= 0x10000;
1072 s[0] = 0xD800 | (ordinal >> 10);
1073 s[1] = 0xDC00 | (ordinal & 0x3FF);
1074 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075 }
1076#endif
1077
Hye-Shik Chang40574832004-04-06 07:24:51 +00001078 s[0] = (Py_UNICODE)ordinal;
1079 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001080}
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082PyObject *PyUnicode_FromObject(register PyObject *obj)
1083{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001085 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 if (PyUnicode_CheckExact(obj)) {
1087 Py_INCREF(obj);
1088 return obj;
1089 }
1090 if (PyUnicode_Check(obj)) {
1091 /* For a Unicode subtype that's not a Unicode object,
1092 return a true Unicode object with the same data. */
1093 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1094 PyUnicode_GET_SIZE(obj));
1095 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001096 PyErr_Format(PyExc_TypeError,
1097 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001098 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100}
1101
1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1103 const char *encoding,
1104 const char *errors)
1105{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001107 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (obj == NULL) {
1111 PyErr_BadInternalCall();
1112 return NULL;
1113 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115 if (PyUnicode_Check(obj)) {
1116 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001117 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120
1121 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001122 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001123 s = PyBytes_AS_STRING(obj);
1124 len = PyBytes_GET_SIZE(obj);
1125 }
1126 else if (PyByteArray_Check(obj)) {
1127 s = PyByteArray_AS_STRING(obj);
1128 len = PyByteArray_GET_SIZE(obj);
1129 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001130 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1131 /* Overwrite the error message with something more useful in
1132 case of a TypeError. */
1133 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001134 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001135 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001137 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001138 goto onError;
1139 }
Tim Petersced69f82003-09-16 20:30:58 +00001140
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001141 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 if (len == 0) {
1143 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 }
Tim Petersced69f82003-09-16 20:30:58 +00001146 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001148
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 return v;
1150
1151 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153}
1154
1155PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 const char *encoding,
1158 const char *errors)
1159{
1160 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001161 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 char lower[20]; /* Enough for any encoding name we recognize */
1163 char *l;
1164 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165
1166 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Convert encoding to lower case and replace '_' with '-' in order to
1170 catch e.g. UTF_8 */
1171 e = encoding;
1172 l = lower;
1173 while (*e && l < &lower[(sizeof lower) - 2]) {
1174 if (ISUPPER(*e)) {
1175 *l++ = TOLOWER(*e++);
1176 }
1177 else if (*e == '_') {
1178 *l++ = '-';
1179 e++;
1180 }
1181 else {
1182 *l++ = *e++;
1183 }
1184 }
1185 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001186
1187 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001188 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 else if ((strcmp(lower, "latin-1") == 0) ||
1191 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001197 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "utf-16") == 0)
1200 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1201 else if (strcmp(lower, "utf-32") == 0)
1202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203
1204 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001205 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001206 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001208 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (buffer == NULL)
1210 goto onError;
1211 unicode = PyCodec_Decode(buffer, encoding, errors);
1212 if (unicode == NULL)
1213 goto onError;
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001216 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001217 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_DECREF(unicode);
1219 goto onError;
1220 }
1221 Py_DECREF(buffer);
1222 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 onError:
1225 Py_XDECREF(buffer);
1226 return NULL;
1227}
1228
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001229PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1230 const char *encoding,
1231 const char *errors)
1232{
1233 PyObject *v;
1234
1235 if (!PyUnicode_Check(unicode)) {
1236 PyErr_BadArgument();
1237 goto onError;
1238 }
1239
1240 if (encoding == NULL)
1241 encoding = PyUnicode_GetDefaultEncoding();
1242
1243 /* Decode via the codec registry */
1244 v = PyCodec_Decode(unicode, encoding, errors);
1245 if (v == NULL)
1246 goto onError;
1247 return v;
1248
1249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001253PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1254 const char *encoding,
1255 const char *errors)
1256{
1257 PyObject *v;
1258
1259 if (!PyUnicode_Check(unicode)) {
1260 PyErr_BadArgument();
1261 goto onError;
1262 }
1263
1264 if (encoding == NULL)
1265 encoding = PyUnicode_GetDefaultEncoding();
1266
1267 /* Decode via the codec registry */
1268 v = PyCodec_Decode(unicode, encoding, errors);
1269 if (v == NULL)
1270 goto onError;
1271 if (!PyUnicode_Check(v)) {
1272 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001273 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001274 Py_TYPE(v)->tp_name);
1275 Py_DECREF(v);
1276 goto onError;
1277 }
1278 return v;
1279
1280 onError:
1281 return NULL;
1282}
1283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 unicode = PyUnicode_FromUnicode(s, size);
1292 if (unicode == NULL)
1293 return NULL;
1294 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1295 Py_DECREF(unicode);
1296 return v;
1297}
1298
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1300 const char *encoding,
1301 const char *errors)
1302{
1303 PyObject *v;
1304
1305 if (!PyUnicode_Check(unicode)) {
1306 PyErr_BadArgument();
1307 goto onError;
1308 }
1309
1310 if (encoding == NULL)
1311 encoding = PyUnicode_GetDefaultEncoding();
1312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 return v;
1318
1319 onError:
1320 return NULL;
1321}
1322
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1324 const char *encoding,
1325 const char *errors)
1326{
1327 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 if (!PyUnicode_Check(unicode)) {
1330 PyErr_BadArgument();
1331 goto onError;
1332 }
Fred Drakee4315f52000-05-09 19:53:39 +00001333
Tim Petersced69f82003-09-16 20:30:58 +00001334 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001335 encoding = PyUnicode_GetDefaultEncoding();
1336
1337 /* Shortcuts for common default encodings */
1338 if (errors == NULL) {
1339 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001340 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001341 else if (strcmp(encoding, "latin-1") == 0)
1342 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001343#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1344 else if (strcmp(encoding, "mbcs") == 0)
1345 return PyUnicode_AsMBCSString(unicode);
1346#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001347 else if (strcmp(encoding, "ascii") == 0)
1348 return PyUnicode_AsASCIIString(unicode);
1349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350
1351 /* Encode via the codec registry */
1352 v = PyCodec_Encode(unicode, encoding, errors);
1353 if (v == NULL)
1354 goto onError;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001355 if (PyByteArray_Check(v)) {
1356 char msg[100];
1357 PyOS_snprintf(msg, sizeof(msg),
1358 "encoder %s returned buffer instead of bytes",
1359 encoding);
1360 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1361 v = NULL;
1362 goto onError;
1363 }
1364 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1365 }
1366 else if (!PyBytes_Check(v)) {
1367 PyErr_Format(PyExc_TypeError,
1368 "encoder did not return a bytes object (type=%.400s)",
1369 Py_TYPE(v)->tp_name);
1370 v = NULL;
1371 }
1372 return v;
1373
1374 onError:
1375 return NULL;
1376}
1377
1378PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
1383
1384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
1386 goto onError;
1387 }
1388
1389 if (encoding == NULL)
1390 encoding = PyUnicode_GetDefaultEncoding();
1391
1392 /* Encode via the codec registry */
1393 v = PyCodec_Encode(unicode, encoding, errors);
1394 if (v == NULL)
1395 goto onError;
1396 if (!PyUnicode_Check(v)) {
1397 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001398 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399 Py_TYPE(v)->tp_name);
1400 Py_DECREF(v);
1401 goto onError;
1402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001404
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 onError:
1406 return NULL;
1407}
1408
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001409PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1410 const char *errors)
1411{
1412 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001413 if (v)
1414 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001415 if (errors != NULL)
1416 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001417 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001418 PyUnicode_GET_SIZE(unicode),
1419 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001420 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001421 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001422 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001423 return v;
1424}
1425
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001426PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001427PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001428 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001429 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1430}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001431
Christian Heimes5894ba72007-11-04 11:43:14 +00001432PyObject*
1433PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1434{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001435 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1436 can be undefined. If it is case, decode using UTF-8. The following assumes
1437 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1438 bootstrapping process where the codecs aren't ready yet.
1439 */
1440 if (Py_FileSystemDefaultEncoding) {
1441#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001442 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001443 return PyUnicode_DecodeMBCS(s, size, "replace");
1444 }
1445#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001446 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447 return PyUnicode_DecodeUTF8(s, size, "replace");
1448 }
1449#endif
1450 return PyUnicode_Decode(s, size,
1451 Py_FileSystemDefaultEncoding,
1452 "replace");
1453 }
1454 else {
1455 return PyUnicode_DecodeUTF8(s, size, "replace");
1456 }
1457}
1458
Martin v. Löwis5b222132007-06-10 09:51:05 +00001459char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001460_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001461{
Christian Heimesf3863112007-11-22 07:46:41 +00001462 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001463 if (!PyUnicode_Check(unicode)) {
1464 PyErr_BadArgument();
1465 return NULL;
1466 }
Christian Heimesf3863112007-11-22 07:46:41 +00001467 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1468 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001469 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001470 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001471 *psize = PyBytes_GET_SIZE(bytes);
1472 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001473}
1474
1475char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001476_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001477{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001478 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001479}
1480
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1482{
1483 if (!PyUnicode_Check(unicode)) {
1484 PyErr_BadArgument();
1485 goto onError;
1486 }
1487 return PyUnicode_AS_UNICODE(unicode);
1488
1489 onError:
1490 return NULL;
1491}
1492
Martin v. Löwis18e16552006-02-15 17:27:45 +00001493Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494{
1495 if (!PyUnicode_Check(unicode)) {
1496 PyErr_BadArgument();
1497 goto onError;
1498 }
1499 return PyUnicode_GET_SIZE(unicode);
1500
1501 onError:
1502 return -1;
1503}
1504
Thomas Wouters78890102000-07-22 19:25:51 +00001505const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001506{
1507 return unicode_default_encoding;
1508}
1509
1510int PyUnicode_SetDefaultEncoding(const char *encoding)
1511{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001512 if (strcmp(encoding, unicode_default_encoding) != 0) {
1513 PyErr_Format(PyExc_ValueError,
1514 "Can only set default encoding to %s",
1515 unicode_default_encoding);
1516 return -1;
1517 }
Fred Drakee4315f52000-05-09 19:53:39 +00001518 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001519}
1520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001521/* error handling callback helper:
1522 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001523 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 and adjust various state variables.
1525 return 0 on success, -1 on error
1526*/
1527
1528static
1529int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1530 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001531 const char **input, const char **inend, Py_ssize_t *startinpos,
1532 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001533 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001535 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536
1537 PyObject *restuple = NULL;
1538 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001540 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001541 Py_ssize_t requiredsize;
1542 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001544 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001545 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 int res = -1;
1547
1548 if (*errorHandler == NULL) {
1549 *errorHandler = PyCodec_LookupError(errors);
1550 if (*errorHandler == NULL)
1551 goto onError;
1552 }
1553
1554 if (*exceptionObject == NULL) {
1555 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001556 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557 if (*exceptionObject == NULL)
1558 goto onError;
1559 }
1560 else {
1561 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1562 goto onError;
1563 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1564 goto onError;
1565 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1566 goto onError;
1567 }
1568
1569 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1570 if (restuple == NULL)
1571 goto onError;
1572 if (!PyTuple_Check(restuple)) {
1573 PyErr_Format(PyExc_TypeError, &argparse[4]);
1574 goto onError;
1575 }
1576 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1577 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001578
1579 /* Copy back the bytes variables, which might have been modified by the
1580 callback */
1581 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1582 if (!inputobj)
1583 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001584 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001585 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1586 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001587 *input = PyBytes_AS_STRING(inputobj);
1588 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001589 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001590 /* we can DECREF safely, as the exception has another reference,
1591 so the object won't go away. */
1592 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001594 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001595 newpos = insize+newpos;
1596 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001597 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001598 goto onError;
1599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600
1601 /* need more space? (at least enough for what we
1602 have+the replacement+the rest of the string (starting
1603 at the new input position), so we won't have to check space
1604 when there are no errors in the rest of the string) */
1605 repptr = PyUnicode_AS_UNICODE(repunicode);
1606 repsize = PyUnicode_GET_SIZE(repunicode);
1607 requiredsize = *outpos + repsize + insize-newpos;
1608 if (requiredsize > outsize) {
1609 if (requiredsize<2*outsize)
1610 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001611 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 goto onError;
1613 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1614 }
1615 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001616 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_UNICODE_COPY(*outptr, repptr, repsize);
1618 *outptr += repsize;
1619 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 /* we made it! */
1622 res = 0;
1623
1624 onError:
1625 Py_XDECREF(restuple);
1626 return res;
1627}
1628
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629/* --- UTF-7 Codec -------------------------------------------------------- */
1630
1631/* see RFC2152 for details */
1632
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
       The table is indexed by character code; each row below holds the
       16 entries for codes 0x00-0x0F, 0x10-0x1F, ..., 0x70-0x7F. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when c is outside 1..127, is marked
   1 in utf7_special, or is marked 2 / 3 and the matching flag (encodeWS /
   encodeO) is set.  Note that c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per ISALNUM), '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64: '+' -> 62, '/' -> 63, 'a'.. -> 26.., 'A'.. -> 0..,
   anything else is treated as a digit ('0' + 4 == 52).  Only meaningful for
   characters accepted by B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write the
   top 6 pending bits as one base64 character to *out++ and reduce bits by 6.
   Modifies out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the top 16 pending bits as one Py_UNICODE and store it to
   *out++.  A value in 0xDC00..0xDFFF is rejected instead: the macro sets
   surrogate and `errmsg`, then jumps to the label `utf7Error`, so both must
   exist in the function that uses this macro.  The unit following a
   rejected one is dropped silently. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 const char *errors)
1698{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001699 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1700}
1701
/* Decode `size` bytes of UTF-7 data starting at `s` into a new unicode
   object.

   errors:   error handler name, passed to unicode_decode_call_errorhandler().
   consumed: if NULL, a shift sequence still open at the end of the input is
             reported through the error handler.  If non-NULL, no error is
             raised for that case; *consumed receives the offset of the '+'
             that opened the pending sequence, or the number of bytes
             processed when no sequence is pending.

   Returns the new object, or NULL on error.  The DECODE macro used below
   may assign `errmsg` and jump to the `utf7Error` label. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                /* inside a '+'... shift sequence */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for decoded base64 bits */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* start with one output slot per input byte */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* re-entry point used after the "unterminated shift sequence"
           handler below moved s back into the input */
    restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift sequence */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* the terminating '-' itself produces no output; if
                       another '-' follows, a '-' is emitted and shift mode
                       is re-entered without consuming that second '-' */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* the implicit terminator is an ordinary character */
                    *p++ = ch;
                }
            } else {
                /* base64 character: accumulate 6 more bits */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence started for error reporting and
               for *consumed */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* the handler may resize `unicode` and replace the input buffer; it
           updates starts, e, s and p accordingly */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* input exhausted inside a shift sequence: an error unless the caller
       asked for stateful decoding */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* the handler may have set the resume position before the end */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* trim the result to the number of units actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1847
1848
1849PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001850 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 int encodeSetO,
1852 int encodeWhiteSpace,
1853 const char *errors)
1854{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001855 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001857 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001859 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 unsigned int bitsleft = 0;
1861 unsigned long charsleft = 0;
1862 char * out;
1863 char * start;
1864
1865 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001866 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001867
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001868 if (cbAllocated / 5 != size)
1869 return PyErr_NoMemory();
1870
Christian Heimes9c4756e2008-05-26 13:22:05 +00001871 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 if (v == NULL)
1873 return NULL;
1874
Christian Heimes9c4756e2008-05-26 13:22:05 +00001875 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 for (;i < size; ++i) {
1877 Py_UNICODE ch = s[i];
1878
1879 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001880 if (ch == '+') {
1881 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001882 *out++ = '-';
1883 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1884 charsleft = ch;
1885 bitsleft = 16;
1886 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001887 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001889 } else {
1890 *out++ = (char) ch;
1891 }
1892 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1894 *out++ = B64(charsleft << (6-bitsleft));
1895 charsleft = 0;
1896 bitsleft = 0;
1897 /* Characters not in the BASE64 set implicitly unshift the sequence
1898 so no '-' is required, except if the character is itself a '-' */
1899 if (B64CHAR(ch) || ch == '-') {
1900 *out++ = '-';
1901 }
1902 inShift = 0;
1903 *out++ = (char) ch;
1904 } else {
1905 bitsleft += 16;
1906 charsleft = (charsleft << 16) | ch;
1907 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1908
1909 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001910 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911 or '-' then the shift sequence will be terminated implicitly and we
1912 don't have to insert a '-'. */
1913
1914 if (bitsleft == 0) {
1915 if (i + 1 < size) {
1916 Py_UNICODE ch2 = s[i+1];
1917
1918 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001919
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920 } else if (B64CHAR(ch2) || ch2 == '-') {
1921 *out++ = '-';
1922 inShift = 0;
1923 } else {
1924 inShift = 0;
1925 }
1926
1927 }
1928 else {
1929 *out++ = '-';
1930 inShift = 0;
1931 }
1932 }
Tim Petersced69f82003-09-16 20:30:58 +00001933 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001935 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936 if (bitsleft) {
1937 *out++= B64(charsleft << (6-bitsleft) );
1938 *out++ = '-';
1939 }
1940
Christian Heimes72b710a2008-05-26 13:28:38 +00001941 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001942 Py_DECREF(v);
1943 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001944}
1945
1946#undef SPECIAL
1947#undef B64
1948#undef B64CHAR
1949#undef UB64
1950#undef ENCODE
1951#undef DECODE
1952
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953/* --- UTF-8 Codec -------------------------------------------------------- */
1954
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       Each row holds 16 entries: 0x00-0x7F map to 1, 0x80-0xBF to 0,
       0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4, 0xF8-0xFB to 5,
       0xFC-0xFD to 6 and 0xFE-0xFF to 0. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001978 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 const char *errors)
1980{
Walter Dörwald69652032004-09-07 20:24:22 +00001981 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1982}
1983
/* Decode `size` bytes of UTF-8 data starting at `s` into a new unicode
   object.

   errors:   error handler name, passed to unicode_decode_call_errorhandler().
   consumed: if NULL, a multi-byte sequence cut off by the end of the input
             is reported as "unexpected end of data".  If non-NULL, decoding
             stops in front of such a sequence and *consumed receives the
             number of bytes processed.

   Returns the new object, or NULL on error. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* fast path: single-byte (ASCII) character */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* sequence length implied by the lead byte (0 == invalid lead) */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* the sequence extends past the end of the input */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* not expected here: bytes below 0x80 were handled above and
               the table maps no byte >= 0x80 to length 1 */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* the second byte must have the form 10xxxxxx */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* reject values that would have fit in one byte */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* reject values that would have fit in fewer bytes */
            if (ch < 0x0800) {
                /* Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences;

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            /* stored as a single unit when Py_UNICODE_WIDE is defined */
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        /* a complete sequence was decoded: advance past it */
        s += n;
        continue;

    utf8Error:
        /* the handler may resize `unicode` and replace the input buffer; it
           updates starts, e, s and p accordingly */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2170
Tim Peters602f7402002-04-27 18:03:26 +00002171/* Allocation strategy: if the string is short, convert into a stack buffer
2172 and allocate exactly as much space needed at the end. Else allocate the
2173 maximum possible needed (4 result bytes per Unicode character), and return
2174 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002175*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002176PyObject *
2177PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002179 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180{
Tim Peters602f7402002-04-27 18:03:26 +00002181#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002182
Guido van Rossum98297ee2007-11-06 21:34:58 +00002183 Py_ssize_t i; /* index into s of next input byte */
2184 PyObject *result; /* result string object */
2185 char *p; /* next free byte in output buffer */
2186 Py_ssize_t nallocated; /* number of result bytes allocated */
2187 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002188 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002189
Tim Peters602f7402002-04-27 18:03:26 +00002190 assert(s != NULL);
2191 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192
Tim Peters602f7402002-04-27 18:03:26 +00002193 if (size <= MAX_SHORT_UNICHARS) {
2194 /* Write into the stack buffer; nallocated can't overflow.
2195 * At the end, we'll allocate exactly as much heap space as it
2196 * turns out we need.
2197 */
2198 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002199 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002200 p = stackbuf;
2201 }
2202 else {
2203 /* Overallocate on the heap, and give the excess back at the end. */
2204 nallocated = size * 4;
2205 if (nallocated / 4 != size) /* overflow! */
2206 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002207 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002208 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002209 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002210 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002211 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002212
Tim Peters602f7402002-04-27 18:03:26 +00002213 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002215
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002216 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002217 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002221 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002222 *p++ = (char)(0xc0 | (ch >> 6));
2223 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002224 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002225 else {
Tim Peters602f7402002-04-27 18:03:26 +00002226 /* Encode UCS2 Unicode ordinals */
2227 if (ch < 0x10000) {
2228 /* Special case: check for high surrogate */
2229 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2230 Py_UCS4 ch2 = s[i];
2231 /* Check for low surrogate and combine the two to
2232 form a UCS4 value */
2233 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002234 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002235 i++;
2236 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002237 }
Tim Peters602f7402002-04-27 18:03:26 +00002238 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002239 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002240 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002241 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2242 *p++ = (char)(0x80 | (ch & 0x3f));
2243 continue;
2244 }
2245encodeUCS4:
2246 /* Encode UCS4 Unicode ordinals */
2247 *p++ = (char)(0xf0 | (ch >> 18));
2248 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2249 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2250 *p++ = (char)(0x80 | (ch & 0x3f));
2251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002253
Guido van Rossum98297ee2007-11-06 21:34:58 +00002254 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002255 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002256 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002257 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002258 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002259 }
2260 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002261 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002262 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002263 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002264 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002265 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002266 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002267
Tim Peters602f7402002-04-27 18:03:26 +00002268#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269}
2270
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2272{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 if (!PyUnicode_Check(unicode)) {
2274 PyErr_BadArgument();
2275 return NULL;
2276 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002277 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2278 PyUnicode_GET_SIZE(unicode),
2279 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280}
2281
Walter Dörwald41980ca2007-08-16 21:55:45 +00002282/* --- UTF-32 Codec ------------------------------------------------------- */
2283
2284PyObject *
2285PyUnicode_DecodeUTF32(const char *s,
2286 Py_ssize_t size,
2287 const char *errors,
2288 int *byteorder)
2289{
2290 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2291}
2292
2293PyObject *
2294PyUnicode_DecodeUTF32Stateful(const char *s,
2295 Py_ssize_t size,
2296 const char *errors,
2297 int *byteorder,
2298 Py_ssize_t *consumed)
2299{
2300 const char *starts = s;
2301 Py_ssize_t startinpos;
2302 Py_ssize_t endinpos;
2303 Py_ssize_t outpos;
2304 PyUnicodeObject *unicode;
2305 Py_UNICODE *p;
2306#ifndef Py_UNICODE_WIDE
2307 int i, pairs;
2308#else
2309 const int pairs = 0;
2310#endif
2311 const unsigned char *q, *e;
2312 int bo = 0; /* assume native ordering by default */
2313 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002314 /* Offsets from q for retrieving bytes in the right order. */
2315#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2316 int iorder[] = {0, 1, 2, 3};
2317#else
2318 int iorder[] = {3, 2, 1, 0};
2319#endif
2320 PyObject *errorHandler = NULL;
2321 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002322 /* On narrow builds we split characters outside the BMP into two
2323 codepoints => count how much extra space we need. */
2324#ifndef Py_UNICODE_WIDE
2325 for (i = pairs = 0; i < size/4; i++)
2326 if (((Py_UCS4 *)s)[i] >= 0x10000)
2327 pairs++;
2328#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002329
2330 /* This might be one to much, because of a BOM */
2331 unicode = _PyUnicode_New((size+3)/4+pairs);
2332 if (!unicode)
2333 return NULL;
2334 if (size == 0)
2335 return (PyObject *)unicode;
2336
2337 /* Unpack UTF-32 encoded data */
2338 p = unicode->str;
2339 q = (unsigned char *)s;
2340 e = q + size;
2341
2342 if (byteorder)
2343 bo = *byteorder;
2344
2345 /* Check for BOM marks (U+FEFF) in the input and adjust current
2346 byte order setting accordingly. In native mode, the leading BOM
2347 mark is skipped, in all other modes, it is copied to the output
2348 stream as-is (giving a ZWNBSP character). */
2349 if (bo == 0) {
2350 if (size >= 4) {
2351 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2352 (q[iorder[1]] << 8) | q[iorder[0]];
2353#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2354 if (bom == 0x0000FEFF) {
2355 q += 4;
2356 bo = -1;
2357 }
2358 else if (bom == 0xFFFE0000) {
2359 q += 4;
2360 bo = 1;
2361 }
2362#else
2363 if (bom == 0x0000FEFF) {
2364 q += 4;
2365 bo = 1;
2366 }
2367 else if (bom == 0xFFFE0000) {
2368 q += 4;
2369 bo = -1;
2370 }
2371#endif
2372 }
2373 }
2374
2375 if (bo == -1) {
2376 /* force LE */
2377 iorder[0] = 0;
2378 iorder[1] = 1;
2379 iorder[2] = 2;
2380 iorder[3] = 3;
2381 }
2382 else if (bo == 1) {
2383 /* force BE */
2384 iorder[0] = 3;
2385 iorder[1] = 2;
2386 iorder[2] = 1;
2387 iorder[3] = 0;
2388 }
2389
2390 while (q < e) {
2391 Py_UCS4 ch;
2392 /* remaining bytes at the end? (size should be divisible by 4) */
2393 if (e-q<4) {
2394 if (consumed)
2395 break;
2396 errmsg = "truncated data";
2397 startinpos = ((const char *)q)-starts;
2398 endinpos = ((const char *)e)-starts;
2399 goto utf32Error;
2400 /* The remaining input chars are ignored if the callback
2401 chooses to skip the input */
2402 }
2403 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2404 (q[iorder[1]] << 8) | q[iorder[0]];
2405
2406 if (ch >= 0x110000)
2407 {
2408 errmsg = "codepoint not in range(0x110000)";
2409 startinpos = ((const char *)q)-starts;
2410 endinpos = startinpos+4;
2411 goto utf32Error;
2412 }
2413#ifndef Py_UNICODE_WIDE
2414 if (ch >= 0x10000)
2415 {
2416 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2417 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2418 }
2419 else
2420#endif
2421 *p++ = ch;
2422 q += 4;
2423 continue;
2424 utf32Error:
2425 outpos = p-PyUnicode_AS_UNICODE(unicode);
2426 if (unicode_decode_call_errorhandler(
2427 errors, &errorHandler,
2428 "utf32", errmsg,
2429 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2430 (PyObject **)&unicode, &outpos, &p))
2431 goto onError;
2432 }
2433
2434 if (byteorder)
2435 *byteorder = bo;
2436
2437 if (consumed)
2438 *consumed = (const char *)q-starts;
2439
2440 /* Adjust length */
2441 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2442 goto onError;
2443
2444 Py_XDECREF(errorHandler);
2445 Py_XDECREF(exc);
2446 return (PyObject *)unicode;
2447
2448onError:
2449 Py_DECREF(unicode);
2450 Py_XDECREF(errorHandler);
2451 Py_XDECREF(exc);
2452 return NULL;
2453}
2454
2455PyObject *
2456PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2457 Py_ssize_t size,
2458 const char *errors,
2459 int byteorder)
2460{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002461 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002462 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002463 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002464#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002465 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002466#else
2467 const int pairs = 0;
2468#endif
2469 /* Offsets from p for storing byte pairs in the right order. */
2470#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2471 int iorder[] = {0, 1, 2, 3};
2472#else
2473 int iorder[] = {3, 2, 1, 0};
2474#endif
2475
2476#define STORECHAR(CH) \
2477 do { \
2478 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2479 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2480 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2481 p[iorder[0]] = (CH) & 0xff; \
2482 p += 4; \
2483 } while(0)
2484
2485 /* In narrow builds we can output surrogate pairs as one codepoint,
2486 so we need less space. */
2487#ifndef Py_UNICODE_WIDE
2488 for (i = pairs = 0; i < size-1; i++)
2489 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2490 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2491 pairs++;
2492#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002493 nsize = (size - pairs + (byteorder == 0));
2494 bytesize = nsize * 4;
2495 if (bytesize / 4 != nsize)
2496 return PyErr_NoMemory();
2497 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002498 if (v == NULL)
2499 return NULL;
2500
Christian Heimes9c4756e2008-05-26 13:22:05 +00002501 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002502 if (byteorder == 0)
2503 STORECHAR(0xFEFF);
2504 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002505 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002506
2507 if (byteorder == -1) {
2508 /* force LE */
2509 iorder[0] = 0;
2510 iorder[1] = 1;
2511 iorder[2] = 2;
2512 iorder[3] = 3;
2513 }
2514 else if (byteorder == 1) {
2515 /* force BE */
2516 iorder[0] = 3;
2517 iorder[1] = 2;
2518 iorder[2] = 1;
2519 iorder[3] = 0;
2520 }
2521
2522 while (size-- > 0) {
2523 Py_UCS4 ch = *s++;
2524#ifndef Py_UNICODE_WIDE
2525 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2526 Py_UCS4 ch2 = *s;
2527 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2528 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2529 s++;
2530 size--;
2531 }
2532 }
2533#endif
2534 STORECHAR(ch);
2535 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002536
2537 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002538 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002539 Py_DECREF(v);
2540 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002541#undef STORECHAR
2542}
2543
2544PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2545{
2546 if (!PyUnicode_Check(unicode)) {
2547 PyErr_BadArgument();
2548 return NULL;
2549 }
2550 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2551 PyUnicode_GET_SIZE(unicode),
2552 NULL,
2553 0);
2554}
2555
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556/* --- UTF-16 Codec ------------------------------------------------------- */
2557
Tim Peters772747b2001-08-09 22:21:55 +00002558PyObject *
2559PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002561 const char *errors,
2562 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563{
Walter Dörwald69652032004-09-07 20:24:22 +00002564 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2565}
2566
2567PyObject *
2568PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002570 const char *errors,
2571 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002572 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002574 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002575 Py_ssize_t startinpos;
2576 Py_ssize_t endinpos;
2577 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 PyUnicodeObject *unicode;
2579 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002580 const unsigned char *q, *e;
2581 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002582 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002583 /* Offsets from q for retrieving byte pairs in the right order. */
2584#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2585 int ihi = 1, ilo = 0;
2586#else
2587 int ihi = 0, ilo = 1;
2588#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 PyObject *errorHandler = NULL;
2590 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591
2592 /* Note: size will always be longer than the resulting Unicode
2593 character count */
2594 unicode = _PyUnicode_New(size);
2595 if (!unicode)
2596 return NULL;
2597 if (size == 0)
2598 return (PyObject *)unicode;
2599
2600 /* Unpack UTF-16 encoded data */
2601 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002602 q = (unsigned char *)s;
2603 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002606 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002608 /* Check for BOM marks (U+FEFF) in the input and adjust current
2609 byte order setting accordingly. In native mode, the leading BOM
2610 mark is skipped, in all other modes, it is copied to the output
2611 stream as-is (giving a ZWNBSP character). */
2612 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002613 if (size >= 2) {
2614 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002615#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002616 if (bom == 0xFEFF) {
2617 q += 2;
2618 bo = -1;
2619 }
2620 else if (bom == 0xFFFE) {
2621 q += 2;
2622 bo = 1;
2623 }
Tim Petersced69f82003-09-16 20:30:58 +00002624#else
Walter Dörwald69652032004-09-07 20:24:22 +00002625 if (bom == 0xFEFF) {
2626 q += 2;
2627 bo = 1;
2628 }
2629 else if (bom == 0xFFFE) {
2630 q += 2;
2631 bo = -1;
2632 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002633#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002634 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636
Tim Peters772747b2001-08-09 22:21:55 +00002637 if (bo == -1) {
2638 /* force LE */
2639 ihi = 1;
2640 ilo = 0;
2641 }
2642 else if (bo == 1) {
2643 /* force BE */
2644 ihi = 0;
2645 ilo = 1;
2646 }
2647
2648 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002650 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002652 if (consumed)
2653 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 errmsg = "truncated data";
2655 startinpos = ((const char *)q)-starts;
2656 endinpos = ((const char *)e)-starts;
2657 goto utf16Error;
2658 /* The remaining input chars are ignored if the callback
2659 chooses to skip the input */
2660 }
2661 ch = (q[ihi] << 8) | q[ilo];
2662
Tim Peters772747b2001-08-09 22:21:55 +00002663 q += 2;
2664
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 if (ch < 0xD800 || ch > 0xDFFF) {
2666 *p++ = ch;
2667 continue;
2668 }
2669
2670 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002671 if (q >= e) {
2672 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 startinpos = (((const char *)q)-2)-starts;
2674 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002675 goto utf16Error;
2676 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002677 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002678 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2679 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002680 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002681#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002682 *p++ = ch;
2683 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002684#else
2685 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002686#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002687 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002688 }
2689 else {
2690 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 startinpos = (((const char *)q)-4)-starts;
2692 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002693 goto utf16Error;
2694 }
2695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002697 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 startinpos = (((const char *)q)-2)-starts;
2699 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002700 /* Fall through to report the error */
2701
2702 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 outpos = p-PyUnicode_AS_UNICODE(unicode);
2704 if (unicode_decode_call_errorhandler(
2705 errors, &errorHandler,
2706 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002707 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002709 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 }
2711
2712 if (byteorder)
2713 *byteorder = bo;
2714
Walter Dörwald69652032004-09-07 20:24:22 +00002715 if (consumed)
2716 *consumed = (const char *)q-starts;
2717
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002719 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 goto onError;
2721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 return (PyObject *)unicode;
2725
2726onError:
2727 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return NULL;
2731}
2732
Tim Peters772747b2001-08-09 22:21:55 +00002733PyObject *
2734PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002735 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002736 const char *errors,
2737 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002739 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002740 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002741 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002742#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002743 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002744#else
2745 const int pairs = 0;
2746#endif
Tim Peters772747b2001-08-09 22:21:55 +00002747 /* Offsets from p for storing byte pairs in the right order. */
2748#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2749 int ihi = 1, ilo = 0;
2750#else
2751 int ihi = 0, ilo = 1;
2752#endif
2753
2754#define STORECHAR(CH) \
2755 do { \
2756 p[ihi] = ((CH) >> 8) & 0xff; \
2757 p[ilo] = (CH) & 0xff; \
2758 p += 2; \
2759 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002761#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762 for (i = pairs = 0; i < size; i++)
2763 if (s[i] >= 0x10000)
2764 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002765#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002766 /* 2 * (size + pairs + (byteorder == 0)) */
2767 if (size > PY_SSIZE_T_MAX ||
2768 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2769 return PyErr_NoMemory();
2770 nsize = size + pairs + (byteorder == 0);
2771 bytesize = nsize * 2;
2772 if (bytesize / 2 != nsize)
2773 return PyErr_NoMemory();
2774 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 if (v == NULL)
2776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777
Christian Heimes9c4756e2008-05-26 13:22:05 +00002778 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002780 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002781 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002782 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002783
2784 if (byteorder == -1) {
2785 /* force LE */
2786 ihi = 1;
2787 ilo = 0;
2788 }
2789 else if (byteorder == 1) {
2790 /* force BE */
2791 ihi = 0;
2792 ilo = 1;
2793 }
2794
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002795 while (size-- > 0) {
2796 Py_UNICODE ch = *s++;
2797 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002798#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002799 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002800 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2801 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002803#endif
Tim Peters772747b2001-08-09 22:21:55 +00002804 STORECHAR(ch);
2805 if (ch2)
2806 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002807 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002808
2809 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002810 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002811 Py_DECREF(v);
2812 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002813#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814}
2815
2816PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2817{
2818 if (!PyUnicode_Check(unicode)) {
2819 PyErr_BadArgument();
2820 return NULL;
2821 }
2822 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2823 PyUnicode_GET_SIZE(unicode),
2824 NULL,
2825 0);
2826}
2827
2828/* --- Unicode Escape Codec ----------------------------------------------- */
2829
/* Cached pointer to the name-lookup C API exported by the unicodedata
   module.  Starts out NULL; PyUnicode_DecodeUnicodeEscape() imports
   unicodedata and fills it in the first time it meets a \N{...} escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002831
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002833 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 const char *errors)
2835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 Py_ssize_t startinpos;
2838 Py_ssize_t endinpos;
2839 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002844 char* message;
2845 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 PyObject *errorHandler = NULL;
2847 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002848
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 /* Escaped strings will always be longer than the resulting
2850 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 length after conversion to the true value.
2852 (but if the error callback returns a long replacement string
2853 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 v = _PyUnicode_New(size);
2855 if (v == NULL)
2856 goto onError;
2857 if (size == 0)
2858 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002862
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 while (s < end) {
2864 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002865 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867
2868 /* Non-escape characters are interpreted as Unicode ordinals */
2869 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 continue;
2872 }
2873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 /* \ - Escapes */
2876 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002877 c = *s++;
2878 if (s > end)
2879 c = '\0'; /* Invalid after \ */
2880 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881
2882 /* \x escapes */
2883 case '\n': break;
2884 case '\\': *p++ = '\\'; break;
2885 case '\'': *p++ = '\''; break;
2886 case '\"': *p++ = '\"'; break;
2887 case 'b': *p++ = '\b'; break;
2888 case 'f': *p++ = '\014'; break; /* FF */
2889 case 't': *p++ = '\t'; break;
2890 case 'n': *p++ = '\n'; break;
2891 case 'r': *p++ = '\r'; break;
2892 case 'v': *p++ = '\013'; break; /* VT */
2893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2894
2895 /* \OOO (octal) escapes */
2896 case '0': case '1': case '2': case '3':
2897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002898 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002899 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002900 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002901 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 break;
2906
Fredrik Lundhccc74732001-02-18 22:13:49 +00002907 /* hex escapes */
2908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 digits = 2;
2911 message = "truncated \\xXX escape";
2912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 digits = 4;
2917 message = "truncated \\uXXXX escape";
2918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002922 digits = 8;
2923 message = "truncated \\UXXXXXXXX escape";
2924 hexescape:
2925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002926 outpos = p-PyUnicode_AS_UNICODE(v);
2927 if (s+digits>end) {
2928 endinpos = size;
2929 if (unicode_decode_call_errorhandler(
2930 errors, &errorHandler,
2931 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002932 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 (PyObject **)&v, &outpos, &p))
2934 goto onError;
2935 goto nextByte;
2936 }
2937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002938 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002939 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 endinpos = (s+i+1)-starts;
2941 if (unicode_decode_call_errorhandler(
2942 errors, &errorHandler,
2943 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002944 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002948 }
2949 chr = (chr<<4) & ~0xF;
2950 if (c >= '0' && c <= '9')
2951 chr += c - '0';
2952 else if (c >= 'a' && c <= 'f')
2953 chr += 10 + c - 'a';
2954 else
2955 chr += 10 + c - 'A';
2956 }
2957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 /* _decoding_error will have already written into the
2960 target buffer. */
2961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002963 /* when we get here, chr is a 32-bit unicode character */
2964 if (chr <= 0xffff)
2965 /* UCS-2 character */
2966 *p++ = (Py_UNICODE) chr;
2967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002971 *p++ = chr;
2972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002973 chr -= 0x10000L;
2974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 endinpos = s-starts;
2979 outpos = p-PyUnicode_AS_UNICODE(v);
2980 if (unicode_decode_call_errorhandler(
2981 errors, &errorHandler,
2982 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002983 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002985 goto onError;
2986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002987 break;
2988
2989 /* \N{name} */
2990 case 'N':
2991 message = "malformed \\N character escape";
2992 if (ucnhash_CAPI == NULL) {
2993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002994 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002995 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002996 if (m == NULL)
2997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003004 if (ucnhash_CAPI == NULL)
3005 goto ucnhashError;
3006 }
3007 if (*s == '{') {
3008 const char *start = s+1;
3009 /* look for the closing brace */
3010 while (*s != '}' && s < end)
3011 s++;
3012 if (s > start && s < end && *s == '}') {
3013 /* found a name. look it up in the unicode database */
3014 message = "unknown Unicode character name";
3015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003017 goto store;
3018 }
3019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 endinpos = s-starts;
3021 outpos = p-PyUnicode_AS_UNICODE(v);
3022 if (unicode_decode_call_errorhandler(
3023 errors, &errorHandler,
3024 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003025 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003028 break;
3029
3030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 message = "\\ at end of string";
3033 s--;
3034 endinpos = s-starts;
3035 outpos = p-PyUnicode_AS_UNICODE(v);
3036 if (unicode_decode_call_errorhandler(
3037 errors, &errorHandler,
3038 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003039 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003041 goto onError;
3042 }
3043 else {
3044 *p++ = '\\';
3045 *p++ = (unsigned char)s[-1];
3046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 nextByte:
3050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003057
Fredrik Lundhccc74732001-02-18 22:13:49 +00003058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003059 PyErr_SetString(
3060 PyExc_UnicodeError,
3061 "\\N escapes not supported (can't load unicodedata module)"
3062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 Py_XDECREF(errorHandler);
3065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003066 return NULL;
3067
Fredrik Lundhccc74732001-02-18 22:13:49 +00003068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 return NULL;
3073}
3074
3075/* Return a Unicode-Escape string version of the Unicode object.
3076
3077 If quotes is true, the string is enclosed in u"" or u'' quotes as
3078 appropriate.
3079
3080*/
3081
Thomas Wouters477c8d52006-05-27 19:21:47 +00003082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3083 Py_ssize_t size,
3084 Py_UNICODE ch)
3085{
3086 /* like wcschr, but doesn't stop at NULL characters */
3087
3088 while (size-- > 0) {
3089 if (*s == ch)
3090 return s;
3091 s++;
3092 }
3093
3094 return NULL;
3095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003096
Walter Dörwald79e913e2007-05-12 11:08:06 +00003097static const char *hexdigits = "0123456789abcdef";
3098
3099PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003102 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003105#ifdef Py_UNICODE_WIDE
3106 const Py_ssize_t expandsize = 10;
3107#else
3108 const Py_ssize_t expandsize = 6;
3109#endif
3110
Thomas Wouters89f507f2006-12-13 04:49:30 +00003111 /* XXX(nnorwitz): rather than over-allocating, it would be
3112 better to choose a different scheme. Perhaps scan the
3113 first N-chars of the string and allocate based on that size.
3114 */
3115 /* Initial allocation is based on the longest-possible unichr
3116 escape.
3117
3118 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3119 unichr, so in this case it's the longest unichr escape. In
3120 narrow (UTF-16) builds this is five chars per source unichr
3121 since there are two unichrs in the surrogate pair, so in narrow
3122 (UTF-16) builds it's not the longest unichr escape.
3123
3124 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3125 so in the narrow (UTF-16) build case it's the longest unichr
3126 escape.
3127 */
3128
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003129 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3130 return PyErr_NoMemory();
3131
Christian Heimes9c4756e2008-05-26 13:22:05 +00003132 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003133 2
3134 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003135 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 if (repr == NULL)
3137 return NULL;
3138
Christian Heimes9c4756e2008-05-26 13:22:05 +00003139 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 while (size-- > 0) {
3142 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003143
Walter Dörwald79e913e2007-05-12 11:08:06 +00003144 /* Escape backslashes */
3145 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 *p++ = '\\';
3147 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003148 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003149 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003150
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003151#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003152 /* Map 21-bit characters to '\U00xxxxxx' */
3153 else if (ch >= 0x10000) {
3154 *p++ = '\\';
3155 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003156 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3157 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3158 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3159 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3160 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3161 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3162 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3163 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003164 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003165 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003166#else
3167 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003168 else if (ch >= 0xD800 && ch < 0xDC00) {
3169 Py_UNICODE ch2;
3170 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003171
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003172 ch2 = *s++;
3173 size--;
3174 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3175 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3176 *p++ = '\\';
3177 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003178 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3179 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3180 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3181 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3182 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3183 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3184 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3185 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003186 continue;
3187 }
3188 /* Fall through: isolated surrogates are copied as-is */
3189 s--;
3190 size++;
3191 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003192#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003193
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003195 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 *p++ = '\\';
3197 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003198 *p++ = hexdigits[(ch >> 12) & 0x000F];
3199 *p++ = hexdigits[(ch >> 8) & 0x000F];
3200 *p++ = hexdigits[(ch >> 4) & 0x000F];
3201 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003203
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003204 /* Map special whitespace to '\t', \n', '\r' */
3205 else if (ch == '\t') {
3206 *p++ = '\\';
3207 *p++ = 't';
3208 }
3209 else if (ch == '\n') {
3210 *p++ = '\\';
3211 *p++ = 'n';
3212 }
3213 else if (ch == '\r') {
3214 *p++ = '\\';
3215 *p++ = 'r';
3216 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003217
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003218 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003219 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003221 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003222 *p++ = hexdigits[(ch >> 4) & 0x000F];
3223 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003224 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003225
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 /* Copy everything else as-is */
3227 else
3228 *p++ = (char) ch;
3229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230
Christian Heimes72b710a2008-05-26 13:28:38 +00003231 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003232 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003233 Py_DECREF(repr);
3234 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235}
3236
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3238{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003239 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 if (!PyUnicode_Check(unicode)) {
3241 PyErr_BadArgument();
3242 return NULL;
3243 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003244 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3245 PyUnicode_GET_SIZE(unicode));
3246
3247 if (!s)
3248 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003249 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003250 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003251 Py_DECREF(s);
3252 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253}
3254
3255/* --- Raw Unicode Escape Codec ------------------------------------------- */
3256
3257PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003258 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 const char *errors)
3260{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003262 Py_ssize_t startinpos;
3263 Py_ssize_t endinpos;
3264 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 const char *end;
3268 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 PyObject *errorHandler = NULL;
3270 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003271
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 /* Escaped strings will always be longer than the resulting
3273 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 length after conversion to the true value. (But decoding error
3275 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 v = _PyUnicode_New(size);
3277 if (v == NULL)
3278 goto onError;
3279 if (size == 0)
3280 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 end = s + size;
3283 while (s < end) {
3284 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003285 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003287 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288
3289 /* Non-escape characters are interpreted as Unicode ordinals */
3290 if (*s != '\\') {
3291 *p++ = (unsigned char)*s++;
3292 continue;
3293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295
3296 /* \u-escapes are only interpreted iff the number of leading
3297 backslashes if odd */
3298 bs = s;
3299 for (;s < end;) {
3300 if (*s != '\\')
3301 break;
3302 *p++ = (unsigned char)*s++;
3303 }
3304 if (((s - bs) & 1) == 0 ||
3305 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003306 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 continue;
3308 }
3309 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003310 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 s++;
3312
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003313 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003315 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003317 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 endinpos = s-starts;
3319 if (unicode_decode_call_errorhandler(
3320 errors, &errorHandler,
3321 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003322 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 }
3327 x = (x<<4) & ~0xF;
3328 if (c >= '0' && c <= '9')
3329 x += c - '0';
3330 else if (c >= 'a' && c <= 'f')
3331 x += 10 + c - 'a';
3332 else
3333 x += 10 + c - 'A';
3334 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003335 if (x <= 0xffff)
3336 /* UCS-2 character */
3337 *p++ = (Py_UNICODE) x;
3338 else if (x <= 0x10ffff) {
3339 /* UCS-4 character. Either store directly, or as
3340 surrogate pair. */
3341#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003342 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003343#else
3344 x -= 0x10000L;
3345 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3346 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3347#endif
3348 } else {
3349 endinpos = s-starts;
3350 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003351 if (unicode_decode_call_errorhandler(
3352 errors, &errorHandler,
3353 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003354 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003355 (PyObject **)&v, &outpos, &p))
3356 goto onError;
3357 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 nextByte:
3359 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003361 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003362 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 Py_XDECREF(errorHandler);
3364 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003366
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 onError:
3368 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 Py_XDECREF(errorHandler);
3370 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 return NULL;
3372}
3373
3374PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003377 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 char *p;
3379 char *q;
3380
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003381#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003382 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003383#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003384 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003385#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003386
3387 if (size > PY_SSIZE_T_MAX / expandsize)
3388 return PyErr_NoMemory();
3389
3390 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 if (repr == NULL)
3392 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003393 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003394 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395
Christian Heimes9c4756e2008-05-26 13:22:05 +00003396 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 while (size-- > 0) {
3398 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003399#ifdef Py_UNICODE_WIDE
3400 /* Map 32-bit characters to '\Uxxxxxxxx' */
3401 if (ch >= 0x10000) {
3402 *p++ = '\\';
3403 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003404 *p++ = hexdigits[(ch >> 28) & 0xf];
3405 *p++ = hexdigits[(ch >> 24) & 0xf];
3406 *p++ = hexdigits[(ch >> 20) & 0xf];
3407 *p++ = hexdigits[(ch >> 16) & 0xf];
3408 *p++ = hexdigits[(ch >> 12) & 0xf];
3409 *p++ = hexdigits[(ch >> 8) & 0xf];
3410 *p++ = hexdigits[(ch >> 4) & 0xf];
3411 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003412 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003413 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003414#else
3415 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3416 if (ch >= 0xD800 && ch < 0xDC00) {
3417 Py_UNICODE ch2;
3418 Py_UCS4 ucs;
3419
3420 ch2 = *s++;
3421 size--;
3422 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3423 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3424 *p++ = '\\';
3425 *p++ = 'U';
3426 *p++ = hexdigits[(ucs >> 28) & 0xf];
3427 *p++ = hexdigits[(ucs >> 24) & 0xf];
3428 *p++ = hexdigits[(ucs >> 20) & 0xf];
3429 *p++ = hexdigits[(ucs >> 16) & 0xf];
3430 *p++ = hexdigits[(ucs >> 12) & 0xf];
3431 *p++ = hexdigits[(ucs >> 8) & 0xf];
3432 *p++ = hexdigits[(ucs >> 4) & 0xf];
3433 *p++ = hexdigits[ucs & 0xf];
3434 continue;
3435 }
3436 /* Fall through: isolated surrogates are copied as-is */
3437 s--;
3438 size++;
3439 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003440#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 /* Map 16-bit characters to '\uxxxx' */
3442 if (ch >= 256) {
3443 *p++ = '\\';
3444 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003445 *p++ = hexdigits[(ch >> 12) & 0xf];
3446 *p++ = hexdigits[(ch >> 8) & 0xf];
3447 *p++ = hexdigits[(ch >> 4) & 0xf];
3448 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 }
3450 /* Copy everything else as-is */
3451 else
3452 *p++ = (char) ch;
3453 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003454 size = p - q;
3455
3456 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003457 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003458 Py_DECREF(repr);
3459 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460}
3461
3462PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3463{
Walter Dörwald711005d2007-05-12 12:03:26 +00003464 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003466 PyErr_BadArgument();
3467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003469 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3470 PyUnicode_GET_SIZE(unicode));
3471
3472 if (!s)
3473 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003474 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003475 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003476 Py_DECREF(s);
3477 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478}
3479
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003480/* --- Unicode Internal Codec ------------------------------------------- */
3481
3482PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003484 const char *errors)
3485{
3486 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003487 Py_ssize_t startinpos;
3488 Py_ssize_t endinpos;
3489 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003490 PyUnicodeObject *v;
3491 Py_UNICODE *p;
3492 const char *end;
3493 const char *reason;
3494 PyObject *errorHandler = NULL;
3495 PyObject *exc = NULL;
3496
Neal Norwitzd43069c2006-01-08 01:12:10 +00003497#ifdef Py_UNICODE_WIDE
3498 Py_UNICODE unimax = PyUnicode_GetMax();
3499#endif
3500
Thomas Wouters89f507f2006-12-13 04:49:30 +00003501 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003502 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3503 if (v == NULL)
3504 goto onError;
3505 if (PyUnicode_GetSize((PyObject *)v) == 0)
3506 return (PyObject *)v;
3507 p = PyUnicode_AS_UNICODE(v);
3508 end = s + size;
3509
3510 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003511 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003512 /* We have to sanity check the raw data, otherwise doom looms for
3513 some malformed UCS-4 data. */
3514 if (
3515 #ifdef Py_UNICODE_WIDE
3516 *p > unimax || *p < 0 ||
3517 #endif
3518 end-s < Py_UNICODE_SIZE
3519 )
3520 {
3521 startinpos = s - starts;
3522 if (end-s < Py_UNICODE_SIZE) {
3523 endinpos = end-starts;
3524 reason = "truncated input";
3525 }
3526 else {
3527 endinpos = s - starts + Py_UNICODE_SIZE;
3528 reason = "illegal code point (> 0x10FFFF)";
3529 }
3530 outpos = p - PyUnicode_AS_UNICODE(v);
3531 if (unicode_decode_call_errorhandler(
3532 errors, &errorHandler,
3533 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003534 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003535 (PyObject **)&v, &outpos, &p)) {
3536 goto onError;
3537 }
3538 }
3539 else {
3540 p++;
3541 s += Py_UNICODE_SIZE;
3542 }
3543 }
3544
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003545 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003546 goto onError;
3547 Py_XDECREF(errorHandler);
3548 Py_XDECREF(exc);
3549 return (PyObject *)v;
3550
3551 onError:
3552 Py_XDECREF(v);
3553 Py_XDECREF(errorHandler);
3554 Py_XDECREF(exc);
3555 return NULL;
3556}
3557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558/* --- Latin-1 Codec ------------------------------------------------------ */
3559
3560PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 const char *errors)
3563{
3564 PyUnicodeObject *v;
3565 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003568 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003569 Py_UNICODE r = *(unsigned char*)s;
3570 return PyUnicode_FromUnicode(&r, 1);
3571 }
3572
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 v = _PyUnicode_New(size);
3574 if (v == NULL)
3575 goto onError;
3576 if (size == 0)
3577 return (PyObject *)v;
3578 p = PyUnicode_AS_UNICODE(v);
3579 while (size-- > 0)
3580 *p++ = (unsigned char)*s++;
3581 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003582
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 onError:
3584 Py_XDECREF(v);
3585 return NULL;
3586}
3587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588/* create or adjust a UnicodeEncodeError */
3589static void make_encode_exception(PyObject **exceptionObject,
3590 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003591 const Py_UNICODE *unicode, Py_ssize_t size,
3592 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 if (*exceptionObject == NULL) {
3596 *exceptionObject = PyUnicodeEncodeError_Create(
3597 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 }
3599 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3601 goto onError;
3602 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3603 goto onError;
3604 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3605 goto onError;
3606 return;
3607 onError:
3608 Py_DECREF(*exceptionObject);
3609 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 }
3611}
3612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613/* raises a UnicodeEncodeError */
3614static void raise_encode_exception(PyObject **exceptionObject,
3615 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 const Py_UNICODE *unicode, Py_ssize_t size,
3617 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 const char *reason)
3619{
3620 make_encode_exception(exceptionObject,
3621 encoding, unicode, size, startpos, endpos, reason);
3622 if (*exceptionObject != NULL)
3623 PyCodec_StrictErrors(*exceptionObject);
3624}
3625
3626/* error handling callback helper:
3627 build arguments, call the callback and check the arguments,
3628 put the result into newpos and return the replacement string, which
3629 has to be freed by the caller */
3630static PyObject *unicode_encode_call_errorhandler(const char *errors,
3631 PyObject **errorHandler,
3632 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003633 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3634 Py_ssize_t startpos, Py_ssize_t endpos,
3635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003637 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638
3639 PyObject *restuple;
3640 PyObject *resunicode;
3641
3642 if (*errorHandler == NULL) {
3643 *errorHandler = PyCodec_LookupError(errors);
3644 if (*errorHandler == NULL)
3645 return NULL;
3646 }
3647
3648 make_encode_exception(exceptionObject,
3649 encoding, unicode, size, startpos, endpos, reason);
3650 if (*exceptionObject == NULL)
3651 return NULL;
3652
3653 restuple = PyObject_CallFunctionObjArgs(
3654 *errorHandler, *exceptionObject, NULL);
3655 if (restuple == NULL)
3656 return NULL;
3657 if (!PyTuple_Check(restuple)) {
3658 PyErr_Format(PyExc_TypeError, &argparse[4]);
3659 Py_DECREF(restuple);
3660 return NULL;
3661 }
3662 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3663 &resunicode, newpos)) {
3664 Py_DECREF(restuple);
3665 return NULL;
3666 }
3667 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003668 *newpos = size+*newpos;
3669 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003670 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003671 Py_DECREF(restuple);
3672 return NULL;
3673 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 Py_INCREF(resunicode);
3675 Py_DECREF(restuple);
3676 return resunicode;
3677}
3678
3679static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003680 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 const char *errors,
3682 int limit)
3683{
3684 /* output object */
3685 PyObject *res;
3686 /* pointers to the beginning and end+1 of input */
3687 const Py_UNICODE *startp = p;
3688 const Py_UNICODE *endp = p + size;
3689 /* pointer to the beginning of the unencodable characters */
3690 /* const Py_UNICODE *badp = NULL; */
3691 /* pointer into the output */
3692 char *str;
3693 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003694 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003695 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3696 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 PyObject *errorHandler = NULL;
3698 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003699 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 /* the following variable is used for caching string comparisons
3701 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3702 int known_errorHandler = -1;
3703
3704 /* allocate enough for a simple encoding without
3705 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003706 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003707 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003708 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003709 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003710 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003711 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 ressize = size;
3713
3714 while (p<endp) {
3715 Py_UNICODE c = *p;
3716
3717 /* can we encode this? */
3718 if (c<limit) {
3719 /* no overflow check, because we know that the space is enough */
3720 *str++ = (char)c;
3721 ++p;
3722 }
3723 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003724 Py_ssize_t unicodepos = p-startp;
3725 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003727 Py_ssize_t repsize;
3728 Py_ssize_t newpos;
3729 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_UNICODE *uni2;
3731 /* startpos for collecting unencodable chars */
3732 const Py_UNICODE *collstart = p;
3733 const Py_UNICODE *collend = p;
3734 /* find all unecodable characters */
3735 while ((collend < endp) && ((*collend)>=limit))
3736 ++collend;
3737 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3738 if (known_errorHandler==-1) {
3739 if ((errors==NULL) || (!strcmp(errors, "strict")))
3740 known_errorHandler = 1;
3741 else if (!strcmp(errors, "replace"))
3742 known_errorHandler = 2;
3743 else if (!strcmp(errors, "ignore"))
3744 known_errorHandler = 3;
3745 else if (!strcmp(errors, "xmlcharrefreplace"))
3746 known_errorHandler = 4;
3747 else
3748 known_errorHandler = 0;
3749 }
3750 switch (known_errorHandler) {
3751 case 1: /* strict */
3752 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3753 goto onError;
3754 case 2: /* replace */
3755 while (collstart++<collend)
3756 *str++ = '?'; /* fall through */
3757 case 3: /* ignore */
3758 p = collend;
3759 break;
3760 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003761 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 /* determine replacement size (temporarily (mis)uses p) */
3763 for (p = collstart, repsize = 0; p < collend; ++p) {
3764 if (*p<10)
3765 repsize += 2+1+1;
3766 else if (*p<100)
3767 repsize += 2+2+1;
3768 else if (*p<1000)
3769 repsize += 2+3+1;
3770 else if (*p<10000)
3771 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003772#ifndef Py_UNICODE_WIDE
3773 else
3774 repsize += 2+5+1;
3775#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 else if (*p<100000)
3777 repsize += 2+5+1;
3778 else if (*p<1000000)
3779 repsize += 2+6+1;
3780 else
3781 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003782#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 }
3784 requiredsize = respos+repsize+(endp-collend);
3785 if (requiredsize > ressize) {
3786 if (requiredsize<2*ressize)
3787 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003788 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003790 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 ressize = requiredsize;
3792 }
3793 /* generate replacement (temporarily (mis)uses p) */
3794 for (p = collstart; p < collend; ++p) {
3795 str += sprintf(str, "&#%d;", (int)*p);
3796 }
3797 p = collend;
3798 break;
3799 default:
3800 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3801 encoding, reason, startp, size, &exc,
3802 collstart-startp, collend-startp, &newpos);
3803 if (repunicode == NULL)
3804 goto onError;
3805 /* need more space? (at least enough for what we
3806 have+the replacement+the rest of the string, so
3807 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003808 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 repsize = PyUnicode_GET_SIZE(repunicode);
3810 requiredsize = respos+repsize+(endp-collend);
3811 if (requiredsize > ressize) {
3812 if (requiredsize<2*ressize)
3813 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003814 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_DECREF(repunicode);
3816 goto onError;
3817 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003818 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 ressize = requiredsize;
3820 }
3821 /* check if there is anything unencodable in the replacement
3822 and copy it to the output */
3823 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3824 c = *uni2;
3825 if (c >= limit) {
3826 raise_encode_exception(&exc, encoding, startp, size,
3827 unicodepos, unicodepos+1, reason);
3828 Py_DECREF(repunicode);
3829 goto onError;
3830 }
3831 *str = (char)c;
3832 }
3833 p = startp + newpos;
3834 Py_DECREF(repunicode);
3835 }
3836 }
3837 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003838 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003839 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003840 onError:
3841 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 Py_XDECREF(errorHandler);
3843 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003844 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845}
3846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 const char *errors)
3850{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852}
3853
3854PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3855{
3856 if (!PyUnicode_Check(unicode)) {
3857 PyErr_BadArgument();
3858 return NULL;
3859 }
3860 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3861 PyUnicode_GET_SIZE(unicode),
3862 NULL);
3863}
3864
3865/* --- 7-bit ASCII Codec -------------------------------------------------- */
3866
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003868 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 const char *errors)
3870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 PyUnicodeObject *v;
3873 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t startinpos;
3875 Py_ssize_t endinpos;
3876 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 const char *e;
3878 PyObject *errorHandler = NULL;
3879 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003882 if (size == 1 && *(unsigned char*)s < 128) {
3883 Py_UNICODE r = *(unsigned char*)s;
3884 return PyUnicode_FromUnicode(&r, 1);
3885 }
Tim Petersced69f82003-09-16 20:30:58 +00003886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 v = _PyUnicode_New(size);
3888 if (v == NULL)
3889 goto onError;
3890 if (size == 0)
3891 return (PyObject *)v;
3892 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 e = s + size;
3894 while (s < e) {
3895 register unsigned char c = (unsigned char)*s;
3896 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 ++s;
3899 }
3900 else {
3901 startinpos = s-starts;
3902 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003903 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 if (unicode_decode_call_errorhandler(
3905 errors, &errorHandler,
3906 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003907 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003912 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003913 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003914 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 Py_XDECREF(errorHandler);
3916 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003918
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 onError:
3920 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 Py_XDECREF(errorHandler);
3922 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 return NULL;
3924}
3925
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003927 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 const char *errors)
3929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931}
3932
3933PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3934{
3935 if (!PyUnicode_Check(unicode)) {
3936 PyErr_BadArgument();
3937 return NULL;
3938 }
3939 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3940 PyUnicode_GET_SIZE(unicode),
3941 NULL);
3942}
3943
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003944#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003945
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003946/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003947
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003948#if SIZEOF_INT < SIZEOF_SSIZE_T
3949#define NEED_RETRY
3950#endif
3951
3952/* XXX This code is limited to "true" double-byte encodings, as
3953 a) it assumes an incomplete character consists of a single byte, and
3954 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3955 encodings, see IsDBCSLeadByteEx documentation. */
3956
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only inspects the byte value, so the character
   preceding it (as reported by CharPrev) is examined as well.  The byte
   is accepted when there is no preceding character, when the preceding
   character does not begin with a lead byte, or when the preceding
   character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3967
3968/*
3969 * Decode MBCS string into unicode object. If 'final' is set, converts
3970 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3971 */
3972static int decode_mbcs(PyUnicodeObject **v,
3973 const char *s, /* MBCS string */
3974 int size, /* sizeof MBCS string */
3975 int final)
3976{
3977 Py_UNICODE *p;
3978 Py_ssize_t n = 0;
3979 int usize = 0;
3980
3981 assert(size >= 0);
3982
3983 /* Skip trailing lead-byte unless 'final' is set */
3984 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3985 --size;
3986
3987 /* First get the size of the result */
3988 if (size > 0) {
3989 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3990 if (usize == 0) {
3991 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3992 return -1;
3993 }
3994 }
3995
3996 if (*v == NULL) {
3997 /* Create unicode object */
3998 *v = _PyUnicode_New(usize);
3999 if (*v == NULL)
4000 return -1;
4001 }
4002 else {
4003 /* Extend unicode object */
4004 n = PyUnicode_GET_SIZE(*v);
4005 if (_PyUnicode_Resize(v, n + usize) < 0)
4006 return -1;
4007 }
4008
4009 /* Do the conversion */
4010 if (size > 0) {
4011 p = PyUnicode_AS_UNICODE(*v) + n;
4012 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4013 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4014 return -1;
4015 }
4016 }
4017
4018 return size;
4019}
4020
4021PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4022 Py_ssize_t size,
4023 const char *errors,
4024 Py_ssize_t *consumed)
4025{
4026 PyUnicodeObject *v = NULL;
4027 int done;
4028
4029 if (consumed)
4030 *consumed = 0;
4031
4032#ifdef NEED_RETRY
4033 retry:
4034 if (size > INT_MAX)
4035 done = decode_mbcs(&v, s, INT_MAX, 0);
4036 else
4037#endif
4038 done = decode_mbcs(&v, s, (int)size, !consumed);
4039
4040 if (done < 0) {
4041 Py_XDECREF(v);
4042 return NULL;
4043 }
4044
4045 if (consumed)
4046 *consumed += done;
4047
4048#ifdef NEED_RETRY
4049 if (size > INT_MAX) {
4050 s += done;
4051 size -= done;
4052 goto retry;
4053 }
4054#endif
4055
4056 return (PyObject *)v;
4057}
4058
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004059PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004060 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004061 const char *errors)
4062{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004063 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4064}
4065
4066/*
4067 * Convert unicode into string object (MBCS).
4068 * Returns 0 if succeed, -1 otherwise.
4069 */
4070static int encode_mbcs(PyObject **repr,
4071 const Py_UNICODE *p, /* unicode */
4072 int size) /* size of unicode */
4073{
4074 int mbcssize = 0;
4075 Py_ssize_t n = 0;
4076
4077 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078
4079 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004080 if (size > 0) {
4081 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4082 if (mbcssize == 0) {
4083 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4084 return -1;
4085 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004086 }
4087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004088 if (*repr == NULL) {
4089 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004090 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004091 if (*repr == NULL)
4092 return -1;
4093 }
4094 else {
4095 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004096 n = PyBytes_Size(*repr);
4097 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004098 return -1;
4099 }
4100
4101 /* Do the conversion */
4102 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004104 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4105 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4106 return -1;
4107 }
4108 }
4109
4110 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004111}
4112
4113PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004114 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004115 const char *errors)
4116{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004117 PyObject *repr = NULL;
4118 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004119
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004120#ifdef NEED_RETRY
4121 retry:
4122 if (size > INT_MAX)
4123 ret = encode_mbcs(&repr, p, INT_MAX);
4124 else
4125#endif
4126 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004128 if (ret < 0) {
4129 Py_XDECREF(repr);
4130 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004131 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004132
4133#ifdef NEED_RETRY
4134 if (size > INT_MAX) {
4135 p += INT_MAX;
4136 size -= INT_MAX;
4137 goto retry;
4138 }
4139#endif
4140
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004141 return repr;
4142}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004143
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004144PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4145{
4146 if (!PyUnicode_Check(unicode)) {
4147 PyErr_BadArgument();
4148 return NULL;
4149 }
4150 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4151 PyUnicode_GET_SIZE(unicode),
4152 NULL);
4153}
4154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004155#undef NEED_RETRY
4156
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004157#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159/* --- Character Mapping Codec -------------------------------------------- */
4160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 PyObject *mapping,
4164 const char *errors)
4165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t startinpos;
4168 Py_ssize_t endinpos;
4169 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 PyUnicodeObject *v;
4172 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004173 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 PyObject *errorHandler = NULL;
4175 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004176 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004177 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 /* Default to Latin-1 */
4180 if (mapping == NULL)
4181 return PyUnicode_DecodeLatin1(s, size, errors);
4182
4183 v = _PyUnicode_New(size);
4184 if (v == NULL)
4185 goto onError;
4186 if (size == 0)
4187 return (PyObject *)v;
4188 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004190 if (PyUnicode_CheckExact(mapping)) {
4191 mapstring = PyUnicode_AS_UNICODE(mapping);
4192 maplen = PyUnicode_GET_SIZE(mapping);
4193 while (s < e) {
4194 unsigned char ch = *s;
4195 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004197 if (ch < maplen)
4198 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004200 if (x == 0xfffe) {
4201 /* undefined mapping */
4202 outpos = p-PyUnicode_AS_UNICODE(v);
4203 startinpos = s-starts;
4204 endinpos = startinpos+1;
4205 if (unicode_decode_call_errorhandler(
4206 errors, &errorHandler,
4207 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004208 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004209 (PyObject **)&v, &outpos, &p)) {
4210 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004211 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004212 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004213 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004214 *p++ = x;
4215 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004217 }
4218 else {
4219 while (s < e) {
4220 unsigned char ch = *s;
4221 PyObject *w, *x;
4222
4223 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004224 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004225 if (w == NULL)
4226 goto onError;
4227 x = PyObject_GetItem(mapping, w);
4228 Py_DECREF(w);
4229 if (x == NULL) {
4230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4231 /* No mapping found means: mapping is undefined. */
4232 PyErr_Clear();
4233 x = Py_None;
4234 Py_INCREF(x);
4235 } else
4236 goto onError;
4237 }
4238
4239 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004240 if (PyLong_Check(x)) {
4241 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004242 if (value < 0 || value > 65535) {
4243 PyErr_SetString(PyExc_TypeError,
4244 "character mapping must be in range(65536)");
4245 Py_DECREF(x);
4246 goto onError;
4247 }
4248 *p++ = (Py_UNICODE)value;
4249 }
4250 else if (x == Py_None) {
4251 /* undefined mapping */
4252 outpos = p-PyUnicode_AS_UNICODE(v);
4253 startinpos = s-starts;
4254 endinpos = startinpos+1;
4255 if (unicode_decode_call_errorhandler(
4256 errors, &errorHandler,
4257 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004258 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004259 (PyObject **)&v, &outpos, &p)) {
4260 Py_DECREF(x);
4261 goto onError;
4262 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004263 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004264 continue;
4265 }
4266 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004267 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004268
4269 if (targetsize == 1)
4270 /* 1-1 mapping */
4271 *p++ = *PyUnicode_AS_UNICODE(x);
4272
4273 else if (targetsize > 1) {
4274 /* 1-n mapping */
4275 if (targetsize > extrachars) {
4276 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004277 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4278 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004279 (targetsize << 2);
4280 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004281 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004282 if (_PyUnicode_Resize(&v,
4283 PyUnicode_GET_SIZE(v) + needed) < 0) {
4284 Py_DECREF(x);
4285 goto onError;
4286 }
4287 p = PyUnicode_AS_UNICODE(v) + oldpos;
4288 }
4289 Py_UNICODE_COPY(p,
4290 PyUnicode_AS_UNICODE(x),
4291 targetsize);
4292 p += targetsize;
4293 extrachars -= targetsize;
4294 }
4295 /* 1-0 mapping: skip the character */
4296 }
4297 else {
4298 /* wrong return value */
4299 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004300 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004301 Py_DECREF(x);
4302 goto onError;
4303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004305 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 }
4308 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004309 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 Py_XDECREF(errorHandler);
4312 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004314
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 Py_XDECREF(errorHandler);
4317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 Py_XDECREF(v);
4319 return NULL;
4320}
4321
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004322/* Charmap encoding: the lookup table */
4323
4324struct encoding_map{
4325 PyObject_HEAD
4326 unsigned char level1[32];
4327 int count2, count3;
4328 unsigned char level23[1];
4329};
4330
4331static PyObject*
4332encoding_map_size(PyObject *obj, PyObject* args)
4333{
4334 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004335 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004336 128*map->count3);
4337}
4338
4339static PyMethodDef encoding_map_methods[] = {
4340 {"size", encoding_map_size, METH_NOARGS,
4341 PyDoc_STR("Return the size (in bytes) of this object") },
4342 { 0 }
4343};
4344
4345static void
4346encoding_map_dealloc(PyObject* o)
4347{
4348 PyObject_FREE(o);
4349}
4350
4351static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004352 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004353 "EncodingMap", /*tp_name*/
4354 sizeof(struct encoding_map), /*tp_basicsize*/
4355 0, /*tp_itemsize*/
4356 /* methods */
4357 encoding_map_dealloc, /*tp_dealloc*/
4358 0, /*tp_print*/
4359 0, /*tp_getattr*/
4360 0, /*tp_setattr*/
4361 0, /*tp_compare*/
4362 0, /*tp_repr*/
4363 0, /*tp_as_number*/
4364 0, /*tp_as_sequence*/
4365 0, /*tp_as_mapping*/
4366 0, /*tp_hash*/
4367 0, /*tp_call*/
4368 0, /*tp_str*/
4369 0, /*tp_getattro*/
4370 0, /*tp_setattro*/
4371 0, /*tp_as_buffer*/
4372 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4373 0, /*tp_doc*/
4374 0, /*tp_traverse*/
4375 0, /*tp_clear*/
4376 0, /*tp_richcompare*/
4377 0, /*tp_weaklistoffset*/
4378 0, /*tp_iter*/
4379 0, /*tp_iternext*/
4380 encoding_map_methods, /*tp_methods*/
4381 0, /*tp_members*/
4382 0, /*tp_getset*/
4383 0, /*tp_base*/
4384 0, /*tp_dict*/
4385 0, /*tp_descr_get*/
4386 0, /*tp_descr_set*/
4387 0, /*tp_dictoffset*/
4388 0, /*tp_init*/
4389 0, /*tp_alloc*/
4390 0, /*tp_new*/
4391 0, /*tp_free*/
4392 0, /*tp_is_gc*/
4393};
4394
4395PyObject*
4396PyUnicode_BuildEncodingMap(PyObject* string)
4397{
4398 Py_UNICODE *decode;
4399 PyObject *result;
4400 struct encoding_map *mresult;
4401 int i;
4402 int need_dict = 0;
4403 unsigned char level1[32];
4404 unsigned char level2[512];
4405 unsigned char *mlevel1, *mlevel2, *mlevel3;
4406 int count2 = 0, count3 = 0;
4407
4408 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4409 PyErr_BadArgument();
4410 return NULL;
4411 }
4412 decode = PyUnicode_AS_UNICODE(string);
4413 memset(level1, 0xFF, sizeof level1);
4414 memset(level2, 0xFF, sizeof level2);
4415
4416 /* If there isn't a one-to-one mapping of NULL to \0,
4417 or if there are non-BMP characters, we need to use
4418 a mapping dictionary. */
4419 if (decode[0] != 0)
4420 need_dict = 1;
4421 for (i = 1; i < 256; i++) {
4422 int l1, l2;
4423 if (decode[i] == 0
4424 #ifdef Py_UNICODE_WIDE
4425 || decode[i] > 0xFFFF
4426 #endif
4427 ) {
4428 need_dict = 1;
4429 break;
4430 }
4431 if (decode[i] == 0xFFFE)
4432 /* unmapped character */
4433 continue;
4434 l1 = decode[i] >> 11;
4435 l2 = decode[i] >> 7;
4436 if (level1[l1] == 0xFF)
4437 level1[l1] = count2++;
4438 if (level2[l2] == 0xFF)
4439 level2[l2] = count3++;
4440 }
4441
4442 if (count2 >= 0xFF || count3 >= 0xFF)
4443 need_dict = 1;
4444
4445 if (need_dict) {
4446 PyObject *result = PyDict_New();
4447 PyObject *key, *value;
4448 if (!result)
4449 return NULL;
4450 for (i = 0; i < 256; i++) {
4451 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004452 key = PyLong_FromLong(decode[i]);
4453 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 if (!key || !value)
4455 goto failed1;
4456 if (PyDict_SetItem(result, key, value) == -1)
4457 goto failed1;
4458 Py_DECREF(key);
4459 Py_DECREF(value);
4460 }
4461 return result;
4462 failed1:
4463 Py_XDECREF(key);
4464 Py_XDECREF(value);
4465 Py_DECREF(result);
4466 return NULL;
4467 }
4468
4469 /* Create a three-level trie */
4470 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4471 16*count2 + 128*count3 - 1);
4472 if (!result)
4473 return PyErr_NoMemory();
4474 PyObject_Init(result, &EncodingMapType);
4475 mresult = (struct encoding_map*)result;
4476 mresult->count2 = count2;
4477 mresult->count3 = count3;
4478 mlevel1 = mresult->level1;
4479 mlevel2 = mresult->level23;
4480 mlevel3 = mresult->level23 + 16*count2;
4481 memcpy(mlevel1, level1, 32);
4482 memset(mlevel2, 0xFF, 16*count2);
4483 memset(mlevel3, 0, 128*count3);
4484 count3 = 0;
4485 for (i = 1; i < 256; i++) {
4486 int o1, o2, o3, i2, i3;
4487 if (decode[i] == 0xFFFE)
4488 /* unmapped character */
4489 continue;
4490 o1 = decode[i]>>11;
4491 o2 = (decode[i]>>7) & 0xF;
4492 i2 = 16*mlevel1[o1] + o2;
4493 if (mlevel2[i2] == 0xFF)
4494 mlevel2[i2] = count3++;
4495 o3 = decode[i] & 0x7F;
4496 i3 = 128*mlevel2[i2] + o3;
4497 mlevel3[i3] = i;
4498 }
4499 return result;
4500}
4501
4502static int
4503encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4504{
4505 struct encoding_map *map = (struct encoding_map*)mapping;
4506 int l1 = c>>11;
4507 int l2 = (c>>7) & 0xF;
4508 int l3 = c & 0x7F;
4509 int i;
4510
4511#ifdef Py_UNICODE_WIDE
4512 if (c > 0xFFFF) {
4513 return -1;
4514 }
4515#endif
4516 if (c == 0)
4517 return 0;
4518 /* level 1*/
4519 i = map->level1[l1];
4520 if (i == 0xFF) {
4521 return -1;
4522 }
4523 /* level 2*/
4524 i = map->level23[16*i+l2];
4525 if (i == 0xFF) {
4526 return -1;
4527 }
4528 /* level 3 */
4529 i = map->level23[16*map->count2 + 128*i + l3];
4530 if (i == 0) {
4531 return -1;
4532 }
4533 return i;
4534}
4535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536/* Lookup the character ch in the mapping. If the character
4537 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004538 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540{
Christian Heimes217cfd12007-12-02 14:31:20 +00004541 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 PyObject *x;
4543
4544 if (w == NULL)
4545 return NULL;
4546 x = PyObject_GetItem(mapping, w);
4547 Py_DECREF(w);
4548 if (x == NULL) {
4549 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4550 /* No mapping found means: mapping is undefined. */
4551 PyErr_Clear();
4552 x = Py_None;
4553 Py_INCREF(x);
4554 return x;
4555 } else
4556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004558 else if (x == Py_None)
4559 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004560 else if (PyLong_Check(x)) {
4561 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 if (value < 0 || value > 255) {
4563 PyErr_SetString(PyExc_TypeError,
4564 "character mapping must be in range(256)");
4565 Py_DECREF(x);
4566 return NULL;
4567 }
4568 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004570 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004574 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004575 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004576 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 Py_DECREF(x);
4578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
4580}
4581
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004582static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004583charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004584{
Christian Heimes72b710a2008-05-26 13:28:38 +00004585 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004586 /* exponentially overallocate to minimize reallocations */
4587 if (requiredsize < 2*outsize)
4588 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004589 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004590 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004591 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004592}
4593
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004598 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 space is available. Return a new reference to the object that
4600 was put in the output buffer, or Py_None, if the mapping was undefined
4601 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004602 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004604charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004605 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004607 PyObject *rep;
4608 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004609 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610
Christian Heimes90aa7642007-12-19 02:45:37 +00004611 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004612 int res = encoding_map_lookup(c, mapping);
4613 Py_ssize_t requiredsize = *outpos+1;
4614 if (res == -1)
4615 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004616 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004617 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004618 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004620 outstart[(*outpos)++] = (char)res;
4621 return enc_SUCCESS;
4622 }
4623
4624 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004626 return enc_EXCEPTION;
4627 else if (rep==Py_None) {
4628 Py_DECREF(rep);
4629 return enc_FAILED;
4630 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004631 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004633 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004634 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004636 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004638 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004639 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 }
4641 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004642 const char *repchars = PyBytes_AS_STRING(rep);
4643 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004645 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004646 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004648 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004650 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 memcpy(outstart + *outpos, repchars, repsize);
4652 *outpos += repsize;
4653 }
4654 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004655 Py_DECREF(rep);
4656 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657}
4658
4659/* handle an error in PyUnicode_EncodeCharmap
4660 Return 0 on success, -1 on error */
4661static
4662int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004663 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004665 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004666 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667{
4668 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 Py_ssize_t repsize;
4670 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671 Py_UNICODE *uni2;
4672 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 Py_ssize_t collstartpos = *inpos;
4674 Py_ssize_t collendpos = *inpos+1;
4675 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 char *encoding = "charmap";
4677 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004678 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 /* find all unencodable characters */
4681 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004682 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004683 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004684 int res = encoding_map_lookup(p[collendpos], mapping);
4685 if (res != -1)
4686 break;
4687 ++collendpos;
4688 continue;
4689 }
4690
4691 rep = charmapencode_lookup(p[collendpos], mapping);
4692 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004694 else if (rep!=Py_None) {
4695 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 break;
4697 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004698 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 ++collendpos;
4700 }
4701 /* cache callback name lookup
4702 * (if not done yet, i.e. it's the first error) */
4703 if (*known_errorHandler==-1) {
4704 if ((errors==NULL) || (!strcmp(errors, "strict")))
4705 *known_errorHandler = 1;
4706 else if (!strcmp(errors, "replace"))
4707 *known_errorHandler = 2;
4708 else if (!strcmp(errors, "ignore"))
4709 *known_errorHandler = 3;
4710 else if (!strcmp(errors, "xmlcharrefreplace"))
4711 *known_errorHandler = 4;
4712 else
4713 *known_errorHandler = 0;
4714 }
4715 switch (*known_errorHandler) {
4716 case 1: /* strict */
4717 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4718 return -1;
4719 case 2: /* replace */
4720 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4721 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004722 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 return -1;
4724 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004725 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4727 return -1;
4728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 }
4730 /* fall through */
4731 case 3: /* ignore */
4732 *inpos = collendpos;
4733 break;
4734 case 4: /* xmlcharrefreplace */
4735 /* generate replacement (temporarily (mis)uses p) */
4736 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4737 char buffer[2+29+1+1];
4738 char *cp;
4739 sprintf(buffer, "&#%d;", (int)p[collpos]);
4740 for (cp = buffer; *cp; ++cp) {
4741 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004742 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004744 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4746 return -1;
4747 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 }
4749 }
4750 *inpos = collendpos;
4751 break;
4752 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004753 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 encoding, reason, p, size, exceptionObject,
4755 collstartpos, collendpos, &newpos);
4756 if (repunicode == NULL)
4757 return -1;
4758 /* generate replacement */
4759 repsize = PyUnicode_GET_SIZE(repunicode);
4760 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4761 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004762 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 return -1;
4764 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004765 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4768 return -1;
4769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 }
4771 *inpos = newpos;
4772 Py_DECREF(repunicode);
4773 }
4774 return 0;
4775}
4776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004778 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 PyObject *mapping,
4780 const char *errors)
4781{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 /* output object */
4783 PyObject *res = NULL;
4784 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 PyObject *errorHandler = NULL;
4789 PyObject *exc = NULL;
4790 /* the following variable is used for caching string comparisons
4791 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4792 * 3=ignore, 4=xmlcharrefreplace */
4793 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
4795 /* Default to Latin-1 */
4796 if (mapping == NULL)
4797 return PyUnicode_EncodeLatin1(p, size, errors);
4798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 /* allocate enough for a simple encoding without
4800 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004801 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 if (res == NULL)
4803 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004804 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 while (inpos<size) {
4808 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004809 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004810 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004812 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (charmap_encoding_error(p, size, &inpos, mapping,
4814 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004815 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004816 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004817 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 else
4821 /* done with this character => adjust input position */
4822 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004826 if (respos<PyBytes_GET_SIZE(res))
4827 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004828
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 Py_XDECREF(exc);
4830 Py_XDECREF(errorHandler);
4831 return res;
4832
4833 onError:
4834 Py_XDECREF(res);
4835 Py_XDECREF(exc);
4836 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 return NULL;
4838}
4839
4840PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4841 PyObject *mapping)
4842{
4843 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4844 PyErr_BadArgument();
4845 return NULL;
4846 }
4847 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4848 PyUnicode_GET_SIZE(unicode),
4849 mapping,
4850 NULL);
4851}
4852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853/* create or adjust a UnicodeTranslateError */
4854static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004855 const Py_UNICODE *unicode, Py_ssize_t size,
4856 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859 if (*exceptionObject == NULL) {
4860 *exceptionObject = PyUnicodeTranslateError_Create(
4861 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 }
4863 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4865 goto onError;
4866 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4867 goto onError;
4868 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4869 goto onError;
4870 return;
4871 onError:
4872 Py_DECREF(*exceptionObject);
4873 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 }
4875}
4876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877/* raises a UnicodeTranslateError */
4878static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004879 const Py_UNICODE *unicode, Py_ssize_t size,
4880 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004881 const char *reason)
4882{
4883 make_translate_exception(exceptionObject,
4884 unicode, size, startpos, endpos, reason);
4885 if (*exceptionObject != NULL)
4886 PyCodec_StrictErrors(*exceptionObject);
4887}
4888
4889/* error handling callback helper:
4890 build arguments, call the callback and check the arguments,
4891 put the result into newpos and return the replacement string, which
4892 has to be freed by the caller */
4893static PyObject *unicode_translate_call_errorhandler(const char *errors,
4894 PyObject **errorHandler,
4895 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4897 Py_ssize_t startpos, Py_ssize_t endpos,
4898 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004900 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004902 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 PyObject *restuple;
4904 PyObject *resunicode;
4905
4906 if (*errorHandler == NULL) {
4907 *errorHandler = PyCodec_LookupError(errors);
4908 if (*errorHandler == NULL)
4909 return NULL;
4910 }
4911
4912 make_translate_exception(exceptionObject,
4913 unicode, size, startpos, endpos, reason);
4914 if (*exceptionObject == NULL)
4915 return NULL;
4916
4917 restuple = PyObject_CallFunctionObjArgs(
4918 *errorHandler, *exceptionObject, NULL);
4919 if (restuple == NULL)
4920 return NULL;
4921 if (!PyTuple_Check(restuple)) {
4922 PyErr_Format(PyExc_TypeError, &argparse[4]);
4923 Py_DECREF(restuple);
4924 return NULL;
4925 }
4926 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 Py_DECREF(restuple);
4929 return NULL;
4930 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004931 if (i_newpos<0)
4932 *newpos = size+i_newpos;
4933 else
4934 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004935 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004936 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004937 Py_DECREF(restuple);
4938 return NULL;
4939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 Py_INCREF(resunicode);
4941 Py_DECREF(restuple);
4942 return resunicode;
4943}
4944
4945/* Lookup the character ch in the mapping and put the result in result,
4946 which must be decrefed by the caller.
4947 Return 0 on success, -1 on error */
4948static
4949int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4950{
Christian Heimes217cfd12007-12-02 14:31:20 +00004951 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 PyObject *x;
4953
4954 if (w == NULL)
4955 return -1;
4956 x = PyObject_GetItem(mapping, w);
4957 Py_DECREF(w);
4958 if (x == NULL) {
4959 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4960 /* No mapping found means: use 1:1 mapping. */
4961 PyErr_Clear();
4962 *result = NULL;
4963 return 0;
4964 } else
4965 return -1;
4966 }
4967 else if (x == Py_None) {
4968 *result = x;
4969 return 0;
4970 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004971 else if (PyLong_Check(x)) {
4972 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 long max = PyUnicode_GetMax();
4974 if (value < 0 || value > max) {
4975 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004976 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 Py_DECREF(x);
4978 return -1;
4979 }
4980 *result = x;
4981 return 0;
4982 }
4983 else if (PyUnicode_Check(x)) {
4984 *result = x;
4985 return 0;
4986 }
4987 else {
4988 /* wrong return value */
4989 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004990 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00004991 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 return -1;
4993 }
4994}
4995/* ensure that *outobj is at least requiredsize characters long,
4996if not reallocate and adjust various state variables.
4997Return 0 on success, -1 on error */
4998static
Walter Dörwald4894c302003-10-24 14:25:28 +00004999int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005002 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005003 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005005 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005007 if (requiredsize < 2 * oldsize)
5008 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005009 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 return -1;
5011 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 }
5013 return 0;
5014}
5015/* lookup the character, put the result in the output string and adjust
5016 various state variables. Return a new reference to the object that
5017 was put in the output buffer in *result, or Py_None, if the mapping was
5018 undefined (in which case no character was written).
5019 The called must decref result.
5020 Return 0 on success, -1 on error. */
5021static
Walter Dörwald4894c302003-10-24 14:25:28 +00005022int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005023 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005024 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025{
Walter Dörwald4894c302003-10-24 14:25:28 +00005026 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 return -1;
5028 if (*res==NULL) {
5029 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005030 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 }
5032 else if (*res==Py_None)
5033 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005034 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005036 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 }
5038 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005039 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 if (repsize==1) {
5041 /* no overflow check, because we know that the space is enough */
5042 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5043 }
5044 else if (repsize!=0) {
5045 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005046 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005047 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005048 repsize - 1;
5049 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 return -1;
5051 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5052 *outp += repsize;
5053 }
5054 }
5055 else
5056 return -1;
5057 return 0;
5058}
5059
5060PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 PyObject *mapping,
5063 const char *errors)
5064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 /* output object */
5066 PyObject *res = NULL;
5067 /* pointers to the beginning and end+1 of input */
5068 const Py_UNICODE *startp = p;
5069 const Py_UNICODE *endp = p + size;
5070 /* pointer into the output */
5071 Py_UNICODE *str;
5072 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005073 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 char *reason = "character maps to <undefined>";
5075 PyObject *errorHandler = NULL;
5076 PyObject *exc = NULL;
5077 /* the following variable is used for caching string comparisons
5078 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5079 * 3=ignore, 4=xmlcharrefreplace */
5080 int known_errorHandler = -1;
5081
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 if (mapping == NULL) {
5083 PyErr_BadArgument();
5084 return NULL;
5085 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086
5087 /* allocate enough for a simple 1:1 translation without
5088 replacements, if we need more, we'll resize */
5089 res = PyUnicode_FromUnicode(NULL, size);
5090 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005091 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 return res;
5094 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 while (p<endp) {
5097 /* try to encode it */
5098 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005099 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 goto onError;
5102 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005103 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 if (x!=Py_None) /* it worked => adjust input pointer */
5105 ++p;
5106 else { /* untranslatable character */
5107 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108 Py_ssize_t repsize;
5109 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 Py_UNICODE *uni2;
5111 /* startpos for collecting untranslatable chars */
5112 const Py_UNICODE *collstart = p;
5113 const Py_UNICODE *collend = p+1;
5114 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 /* find all untranslatable characters */
5117 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005118 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 goto onError;
5120 Py_XDECREF(x);
5121 if (x!=Py_None)
5122 break;
5123 ++collend;
5124 }
5125 /* cache callback name lookup
5126 * (if not done yet, i.e. it's the first error) */
5127 if (known_errorHandler==-1) {
5128 if ((errors==NULL) || (!strcmp(errors, "strict")))
5129 known_errorHandler = 1;
5130 else if (!strcmp(errors, "replace"))
5131 known_errorHandler = 2;
5132 else if (!strcmp(errors, "ignore"))
5133 known_errorHandler = 3;
5134 else if (!strcmp(errors, "xmlcharrefreplace"))
5135 known_errorHandler = 4;
5136 else
5137 known_errorHandler = 0;
5138 }
5139 switch (known_errorHandler) {
5140 case 1: /* strict */
5141 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5142 goto onError;
5143 case 2: /* replace */
5144 /* No need to check for space, this is a 1:1 replacement */
5145 for (coll = collstart; coll<collend; ++coll)
5146 *str++ = '?';
5147 /* fall through */
5148 case 3: /* ignore */
5149 p = collend;
5150 break;
5151 case 4: /* xmlcharrefreplace */
5152 /* generate replacement (temporarily (mis)uses p) */
5153 for (p = collstart; p < collend; ++p) {
5154 char buffer[2+29+1+1];
5155 char *cp;
5156 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005157 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5159 goto onError;
5160 for (cp = buffer; *cp; ++cp)
5161 *str++ = *cp;
5162 }
5163 p = collend;
5164 break;
5165 default:
5166 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5167 reason, startp, size, &exc,
5168 collstart-startp, collend-startp, &newpos);
5169 if (repunicode == NULL)
5170 goto onError;
5171 /* generate replacement */
5172 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005173 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5175 Py_DECREF(repunicode);
5176 goto onError;
5177 }
5178 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5179 *str++ = *uni2;
5180 p = startp + newpos;
5181 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 }
5183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 /* Resize if we allocated to much */
5186 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005187 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005188 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 }
5191 Py_XDECREF(exc);
5192 Py_XDECREF(errorHandler);
5193 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 onError:
5196 Py_XDECREF(res);
5197 Py_XDECREF(exc);
5198 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return NULL;
5200}
5201
5202PyObject *PyUnicode_Translate(PyObject *str,
5203 PyObject *mapping,
5204 const char *errors)
5205{
5206 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 str = PyUnicode_FromObject(str);
5209 if (str == NULL)
5210 goto onError;
5211 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5212 PyUnicode_GET_SIZE(str),
5213 mapping,
5214 errors);
5215 Py_DECREF(str);
5216 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 onError:
5219 Py_XDECREF(str);
5220 return NULL;
5221}
Tim Petersced69f82003-09-16 20:30:58 +00005222
Guido van Rossum9e896b32000-04-05 20:11:21 +00005223/* --- Decimal Encoder ---------------------------------------------------- */
5224
5225int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005226 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005227 char *output,
5228 const char *errors)
5229{
5230 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 PyObject *errorHandler = NULL;
5232 PyObject *exc = NULL;
5233 const char *encoding = "decimal";
5234 const char *reason = "invalid decimal Unicode string";
5235 /* the following variable is used for caching string comparisons
5236 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5237 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005238
5239 if (output == NULL) {
5240 PyErr_BadArgument();
5241 return -1;
5242 }
5243
5244 p = s;
5245 end = s + length;
5246 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005248 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 Py_ssize_t repsize;
5251 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 Py_UNICODE *uni2;
5253 Py_UNICODE *collstart;
5254 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005255
Guido van Rossum9e896b32000-04-05 20:11:21 +00005256 if (Py_UNICODE_ISSPACE(ch)) {
5257 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005258 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005259 continue;
5260 }
5261 decimal = Py_UNICODE_TODECIMAL(ch);
5262 if (decimal >= 0) {
5263 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005265 continue;
5266 }
Guido van Rossumba477042000-04-06 18:18:10 +00005267 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005268 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005270 continue;
5271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005272 /* All other characters are considered unencodable */
5273 collstart = p;
5274 collend = p+1;
5275 while (collend < end) {
5276 if ((0 < *collend && *collend < 256) ||
5277 !Py_UNICODE_ISSPACE(*collend) ||
5278 Py_UNICODE_TODECIMAL(*collend))
5279 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005280 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281 /* cache callback name lookup
5282 * (if not done yet, i.e. it's the first error) */
5283 if (known_errorHandler==-1) {
5284 if ((errors==NULL) || (!strcmp(errors, "strict")))
5285 known_errorHandler = 1;
5286 else if (!strcmp(errors, "replace"))
5287 known_errorHandler = 2;
5288 else if (!strcmp(errors, "ignore"))
5289 known_errorHandler = 3;
5290 else if (!strcmp(errors, "xmlcharrefreplace"))
5291 known_errorHandler = 4;
5292 else
5293 known_errorHandler = 0;
5294 }
5295 switch (known_errorHandler) {
5296 case 1: /* strict */
5297 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5298 goto onError;
5299 case 2: /* replace */
5300 for (p = collstart; p < collend; ++p)
5301 *output++ = '?';
5302 /* fall through */
5303 case 3: /* ignore */
5304 p = collend;
5305 break;
5306 case 4: /* xmlcharrefreplace */
5307 /* generate replacement (temporarily (mis)uses p) */
5308 for (p = collstart; p < collend; ++p)
5309 output += sprintf(output, "&#%d;", (int)*p);
5310 p = collend;
5311 break;
5312 default:
5313 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5314 encoding, reason, s, length, &exc,
5315 collstart-s, collend-s, &newpos);
5316 if (repunicode == NULL)
5317 goto onError;
5318 /* generate replacement */
5319 repsize = PyUnicode_GET_SIZE(repunicode);
5320 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5321 Py_UNICODE ch = *uni2;
5322 if (Py_UNICODE_ISSPACE(ch))
5323 *output++ = ' ';
5324 else {
5325 decimal = Py_UNICODE_TODECIMAL(ch);
5326 if (decimal >= 0)
5327 *output++ = '0' + decimal;
5328 else if (0 < ch && ch < 256)
5329 *output++ = (char)ch;
5330 else {
5331 Py_DECREF(repunicode);
5332 raise_encode_exception(&exc, encoding,
5333 s, length, collstart-s, collend-s, reason);
5334 goto onError;
5335 }
5336 }
5337 }
5338 p = s + newpos;
5339 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005340 }
5341 }
5342 /* 0-terminate the output string */
5343 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 Py_XDECREF(exc);
5345 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005346 return 0;
5347
5348 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 Py_XDECREF(exc);
5350 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005351 return -1;
5352}
5353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354/* --- Helpers ------------------------------------------------------------ */
5355
Eric Smith8c663262007-08-25 02:26:07 +00005356#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005357#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005358#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005359/* Include _ParseTupleFinds from find.h */
5360#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005361#include "stringlib/find.h"
5362#include "stringlib/partition.h"
5363
Eric Smith5807c412008-05-11 21:00:57 +00005364#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5365#include "stringlib/localeutil.h"
5366
/* Helper macro to fix up start/end slice values.
 *
 * Operates on the variables named `start` and `end` in the invoking scope
 * and on (obj)->length:
 *   - a negative `start` is offset by the length, then clamped to 0;
 *   - an `end` beyond the length is clamped to the length;
 *   - a negative `end` is offset by the length, then clamped to 0.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement; invoke it with a trailing semicolon.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5379
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005381 PyObject *substr,
5382 Py_ssize_t start,
5383 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005386 PyUnicodeObject* str_obj;
5387 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Thomas Wouters477c8d52006-05-27 19:21:47 +00005389 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5390 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005392 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5393 if (!sub_obj) {
5394 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 return -1;
5396 }
Tim Petersced69f82003-09-16 20:30:58 +00005397
Thomas Wouters477c8d52006-05-27 19:21:47 +00005398 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005399
Thomas Wouters477c8d52006-05-27 19:21:47 +00005400 result = stringlib_count(
5401 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5402 );
5403
5404 Py_DECREF(sub_obj);
5405 Py_DECREF(str_obj);
5406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 return result;
5408}
5409
Martin v. Löwis18e16552006-02-15 17:27:45 +00005410Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005411 PyObject *sub,
5412 Py_ssize_t start,
5413 Py_ssize_t end,
5414 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005420 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005421 sub = PyUnicode_FromObject(sub);
5422 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005423 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005424 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 }
Tim Petersced69f82003-09-16 20:30:58 +00005426
Thomas Wouters477c8d52006-05-27 19:21:47 +00005427 if (direction > 0)
5428 result = stringlib_find_slice(
5429 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5430 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5431 start, end
5432 );
5433 else
5434 result = stringlib_rfind_slice(
5435 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5436 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5437 start, end
5438 );
5439
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005441 Py_DECREF(sub);
5442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 return result;
5444}
5445
Tim Petersced69f82003-09-16 20:30:58 +00005446static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447int tailmatch(PyUnicodeObject *self,
5448 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005449 Py_ssize_t start,
5450 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 int direction)
5452{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 if (substring->length == 0)
5454 return 1;
5455
Thomas Wouters477c8d52006-05-27 19:21:47 +00005456 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
5458 end -= substring->length;
5459 if (end < start)
5460 return 0;
5461
5462 if (direction > 0) {
5463 if (Py_UNICODE_MATCH(self, end, substring))
5464 return 1;
5465 } else {
5466 if (Py_UNICODE_MATCH(self, start, substring))
5467 return 1;
5468 }
5469
5470 return 0;
5471}
5472
Martin v. Löwis18e16552006-02-15 17:27:45 +00005473Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005475 Py_ssize_t start,
5476 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 int direction)
5478{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 str = PyUnicode_FromObject(str);
5482 if (str == NULL)
5483 return -1;
5484 substr = PyUnicode_FromObject(substr);
5485 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005486 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 return -1;
5488 }
Tim Petersced69f82003-09-16 20:30:58 +00005489
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 result = tailmatch((PyUnicodeObject *)str,
5491 (PyUnicodeObject *)substr,
5492 start, end, direction);
5493 Py_DECREF(str);
5494 Py_DECREF(substr);
5495 return result;
5496}
5497
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498/* Apply fixfct filter to the Unicode object self and return a
5499 reference to the modified object */
5500
Tim Petersced69f82003-09-16 20:30:58 +00005501static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502PyObject *fixup(PyUnicodeObject *self,
5503 int (*fixfct)(PyUnicodeObject *s))
5504{
5505
5506 PyUnicodeObject *u;
5507
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005508 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 if (u == NULL)
5510 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005511
5512 Py_UNICODE_COPY(u->str, self->str, self->length);
5513
Tim Peters7a29bd52001-09-12 03:03:31 +00005514 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 /* fixfct should return TRUE if it modified the buffer. If
5516 FALSE, return a reference to the original buffer instead
5517 (to save space, not time) */
5518 Py_INCREF(self);
5519 Py_DECREF(u);
5520 return (PyObject*) self;
5521 }
5522 return (PyObject*) u;
5523}
5524
Tim Petersced69f82003-09-16 20:30:58 +00005525static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526int fixupper(PyUnicodeObject *self)
5527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005528 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 Py_UNICODE *s = self->str;
5530 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 while (len-- > 0) {
5533 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005534
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 ch = Py_UNICODE_TOUPPER(*s);
5536 if (ch != *s) {
5537 status = 1;
5538 *s = ch;
5539 }
5540 s++;
5541 }
5542
5543 return status;
5544}
5545
Tim Petersced69f82003-09-16 20:30:58 +00005546static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547int fixlower(PyUnicodeObject *self)
5548{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005549 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 Py_UNICODE *s = self->str;
5551 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 while (len-- > 0) {
5554 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 ch = Py_UNICODE_TOLOWER(*s);
5557 if (ch != *s) {
5558 status = 1;
5559 *s = ch;
5560 }
5561 s++;
5562 }
5563
5564 return status;
5565}
5566
Tim Petersced69f82003-09-16 20:30:58 +00005567static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568int fixswapcase(PyUnicodeObject *self)
5569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005570 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_UNICODE *s = self->str;
5572 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 while (len-- > 0) {
5575 if (Py_UNICODE_ISUPPER(*s)) {
5576 *s = Py_UNICODE_TOLOWER(*s);
5577 status = 1;
5578 } else if (Py_UNICODE_ISLOWER(*s)) {
5579 *s = Py_UNICODE_TOUPPER(*s);
5580 status = 1;
5581 }
5582 s++;
5583 }
5584
5585 return status;
5586}
5587
Tim Petersced69f82003-09-16 20:30:58 +00005588static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589int fixcapitalize(PyUnicodeObject *self)
5590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005592 Py_UNICODE *s = self->str;
5593 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005594
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005595 if (len == 0)
5596 return 0;
5597 if (Py_UNICODE_ISLOWER(*s)) {
5598 *s = Py_UNICODE_TOUPPER(*s);
5599 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005601 s++;
5602 while (--len > 0) {
5603 if (Py_UNICODE_ISUPPER(*s)) {
5604 *s = Py_UNICODE_TOLOWER(*s);
5605 status = 1;
5606 }
5607 s++;
5608 }
5609 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610}
5611
5612static
5613int fixtitle(PyUnicodeObject *self)
5614{
5615 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5616 register Py_UNICODE *e;
5617 int previous_is_cased;
5618
5619 /* Shortcut for single character strings */
5620 if (PyUnicode_GET_SIZE(self) == 1) {
5621 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5622 if (*p != ch) {
5623 *p = ch;
5624 return 1;
5625 }
5626 else
5627 return 0;
5628 }
Tim Petersced69f82003-09-16 20:30:58 +00005629
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 e = p + PyUnicode_GET_SIZE(self);
5631 previous_is_cased = 0;
5632 for (; p < e; p++) {
5633 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 if (previous_is_cased)
5636 *p = Py_UNICODE_TOLOWER(ch);
5637 else
5638 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005639
5640 if (Py_UNICODE_ISLOWER(ch) ||
5641 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 Py_UNICODE_ISTITLE(ch))
5643 previous_is_cased = 1;
5644 else
5645 previous_is_cased = 0;
5646 }
5647 return 1;
5648}
5649
Tim Peters8ce9f162004-08-27 01:49:32 +00005650PyObject *
5651PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652{
Skip Montanaro6543b452004-09-16 03:28:13 +00005653 const Py_UNICODE blank = ' ';
5654 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005656 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005657 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5658 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005659 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5660 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005661 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005662 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Tim Peters05eba1f2004-08-27 21:32:02 +00005664 fseq = PySequence_Fast(seq, "");
5665 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005667 }
5668
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005669 /* NOTE: the following code can't call back into Python code,
5670 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005671 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005672
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 seqlen = PySequence_Fast_GET_SIZE(fseq);
5674 /* If empty sequence, return u"". */
5675 if (seqlen == 0) {
5676 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5677 goto Done;
5678 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005679 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005680 /* If singleton sequence with an exact Unicode, return that. */
5681 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005682 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 if (PyUnicode_CheckExact(item)) {
5684 Py_INCREF(item);
5685 res = (PyUnicodeObject *)item;
5686 goto Done;
5687 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005688 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005689 else {
5690 /* Set up sep and seplen */
5691 if (separator == NULL) {
5692 sep = &blank;
5693 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005694 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005695 else {
5696 if (!PyUnicode_Check(separator)) {
5697 PyErr_Format(PyExc_TypeError,
5698 "separator: expected str instance,"
5699 " %.80s found",
5700 Py_TYPE(separator)->tp_name);
5701 goto onError;
5702 }
5703 sep = PyUnicode_AS_UNICODE(separator);
5704 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005705 }
5706 }
5707
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005708 /* There are at least two things to join, or else we have a subclass
5709 * of str in the sequence.
5710 * Do a pre-pass to figure out the total amount of space we'll
5711 * need (sz), and see whether all argument are strings.
5712 */
5713 sz = 0;
5714 for (i = 0; i < seqlen; i++) {
5715 const Py_ssize_t old_sz = sz;
5716 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005717 if (!PyUnicode_Check(item)) {
5718 PyErr_Format(PyExc_TypeError,
5719 "sequence item %zd: expected str instance,"
5720 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005721 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005722 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005723 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005724 sz += PyUnicode_GET_SIZE(item);
5725 if (i != 0)
5726 sz += seplen;
5727 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5728 PyErr_SetString(PyExc_OverflowError,
5729 "join() result is too long for a Python string");
5730 goto onError;
5731 }
5732 }
Tim Petersced69f82003-09-16 20:30:58 +00005733
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005734 res = _PyUnicode_New(sz);
5735 if (res == NULL)
5736 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005737
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005738 /* Catenate everything. */
5739 res_p = PyUnicode_AS_UNICODE(res);
5740 for (i = 0; i < seqlen; ++i) {
5741 Py_ssize_t itemlen;
5742 item = items[i];
5743 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005744 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005745 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005746 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005747 res_p += seplen;
5748 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005749 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5750 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005751 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005752
Tim Peters8ce9f162004-08-27 01:49:32 +00005753 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005754 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 return (PyObject *)res;
5756
5757 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005758 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005759 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
5761}
5762
Tim Petersced69f82003-09-16 20:30:58 +00005763static
5764PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005765 Py_ssize_t left,
5766 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 Py_UNICODE fill)
5768{
5769 PyUnicodeObject *u;
5770
5771 if (left < 0)
5772 left = 0;
5773 if (right < 0)
5774 right = 0;
5775
Tim Peters7a29bd52001-09-12 03:03:31 +00005776 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 Py_INCREF(self);
5778 return self;
5779 }
5780
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005781 if (left > PY_SSIZE_T_MAX - self->length ||
5782 right > PY_SSIZE_T_MAX - (left + self->length)) {
5783 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5784 return NULL;
5785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 u = _PyUnicode_New(left + self->length + right);
5787 if (u) {
5788 if (left)
5789 Py_UNICODE_FILL(u->str, fill, left);
5790 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5791 if (right)
5792 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5793 }
5794
5795 return u;
5796}
5797
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * The invoking function must provide:
 *   - a `PyObject *str` scratch variable,
 *   - the target `list`,
 *   - an `onError` label, which is jumped to if the object cannot be
 *     created or the append fails.
 * The temporary reference held in `str` is always released.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement; invoke it with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5808
5809static
5810PyObject *split_whitespace(PyUnicodeObject *self,
5811 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 register Py_ssize_t i;
5815 register Py_ssize_t j;
5816 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005818 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820 for (i = j = 0; i < len; ) {
5821 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005822 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 i++;
5824 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005825 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 i++;
5827 if (j < i) {
5828 if (maxcount-- <= 0)
5829 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005830 SPLIT_APPEND(buf, j, i);
5831 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 i++;
5833 j = i;
5834 }
5835 }
5836 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005837 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 }
5839 return list;
5840
5841 onError:
5842 Py_DECREF(list);
5843 return NULL;
5844}
5845
5846PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005847 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 register Py_ssize_t i;
5850 register Py_ssize_t j;
5851 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 PyObject *list;
5853 PyObject *str;
5854 Py_UNICODE *data;
5855
5856 string = PyUnicode_FromObject(string);
5857 if (string == NULL)
5858 return NULL;
5859 data = PyUnicode_AS_UNICODE(string);
5860 len = PyUnicode_GET_SIZE(string);
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 list = PyList_New(0);
5863 if (!list)
5864 goto onError;
5865
5866 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
5873 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005874 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 if (i < len) {
5876 if (data[i] == '\r' && i + 1 < len &&
5877 data[i+1] == '\n')
5878 i += 2;
5879 else
5880 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005881 if (keepends)
5882 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 }
Guido van Rossum86662912000-04-11 15:38:46 +00005884 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 j = i;
5886 }
5887 if (j < len) {
5888 SPLIT_APPEND(data, j, len);
5889 }
5890
5891 Py_DECREF(string);
5892 return list;
5893
5894 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005895 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 Py_DECREF(string);
5897 return NULL;
5898}
5899
Tim Petersced69f82003-09-16 20:30:58 +00005900static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901PyObject *split_char(PyUnicodeObject *self,
5902 PyObject *list,
5903 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005904 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 register Py_ssize_t i;
5907 register Py_ssize_t j;
5908 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005910 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
5912 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005913 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (maxcount-- <= 0)
5915 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005916 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 i = j = i + 1;
5918 } else
5919 i++;
5920 }
5921 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005922 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
5924 return list;
5925
5926 onError:
5927 Py_DECREF(list);
5928 return NULL;
5929}
5930
Tim Petersced69f82003-09-16 20:30:58 +00005931static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932PyObject *split_substring(PyUnicodeObject *self,
5933 PyObject *list,
5934 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005935 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005937 register Py_ssize_t i;
5938 register Py_ssize_t j;
5939 Py_ssize_t len = self->length;
5940 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 PyObject *str;
5942
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005943 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (Py_UNICODE_MATCH(self, i, substring)) {
5945 if (maxcount-- <= 0)
5946 break;
5947 SPLIT_APPEND(self->str, j, i);
5948 i = j = i + sublen;
5949 } else
5950 i++;
5951 }
5952 if (j <= len) {
5953 SPLIT_APPEND(self->str, j, len);
5954 }
5955 return list;
5956
5957 onError:
5958 Py_DECREF(list);
5959 return NULL;
5960}
5961
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005962static
5963PyObject *rsplit_whitespace(PyUnicodeObject *self,
5964 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005966{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 register Py_ssize_t i;
5968 register Py_ssize_t j;
5969 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005971 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005972
5973 for (i = j = len - 1; i >= 0; ) {
5974 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005975 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976 i--;
5977 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005978 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005979 i--;
5980 if (j > i) {
5981 if (maxcount-- <= 0)
5982 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005983 SPLIT_APPEND(buf, i + 1, j + 1);
5984 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985 i--;
5986 j = i;
5987 }
5988 }
5989 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005990 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 if (PyList_Reverse(list) < 0)
5993 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994 return list;
5995
5996 onError:
5997 Py_DECREF(list);
5998 return NULL;
5999}
6000
6001static
6002PyObject *rsplit_char(PyUnicodeObject *self,
6003 PyObject *list,
6004 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 register Py_ssize_t i;
6008 register Py_ssize_t j;
6009 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006011 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012
6013 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006014 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006015 if (maxcount-- <= 0)
6016 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006017 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006018 j = i = i - 1;
6019 } else
6020 i--;
6021 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006022 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006023 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006024 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006025 if (PyList_Reverse(list) < 0)
6026 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006027 return list;
6028
6029 onError:
6030 Py_DECREF(list);
6031 return NULL;
6032}
6033
6034static
6035PyObject *rsplit_substring(PyUnicodeObject *self,
6036 PyObject *list,
6037 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006039{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006040 register Py_ssize_t i;
6041 register Py_ssize_t j;
6042 Py_ssize_t len = self->length;
6043 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006044 PyObject *str;
6045
6046 for (i = len - sublen, j = len; i >= 0; ) {
6047 if (Py_UNICODE_MATCH(self, i, substring)) {
6048 if (maxcount-- <= 0)
6049 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006050 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006051 j = i;
6052 i -= sublen;
6053 } else
6054 i--;
6055 }
6056 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006057 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006058 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006059 if (PyList_Reverse(list) < 0)
6060 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006061 return list;
6062
6063 onError:
6064 Py_DECREF(list);
6065 return NULL;
6066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068#undef SPLIT_APPEND
6069
6070static
6071PyObject *split(PyUnicodeObject *self,
6072 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006073 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
6075 PyObject *list;
6076
6077 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006078 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
6080 list = PyList_New(0);
6081 if (!list)
6082 return NULL;
6083
6084 if (substring == NULL)
6085 return split_whitespace(self,list,maxcount);
6086
6087 else if (substring->length == 1)
6088 return split_char(self,list,substring->str[0],maxcount);
6089
6090 else if (substring->length == 0) {
6091 Py_DECREF(list);
6092 PyErr_SetString(PyExc_ValueError, "empty separator");
6093 return NULL;
6094 }
6095 else
6096 return split_substring(self,list,substring,maxcount);
6097}
6098
Tim Petersced69f82003-09-16 20:30:58 +00006099static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006100PyObject *rsplit(PyUnicodeObject *self,
6101 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006103{
6104 PyObject *list;
6105
6106 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006107 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006108
6109 list = PyList_New(0);
6110 if (!list)
6111 return NULL;
6112
6113 if (substring == NULL)
6114 return rsplit_whitespace(self,list,maxcount);
6115
6116 else if (substring->length == 1)
6117 return rsplit_char(self,list,substring->str[0],maxcount);
6118
6119 else if (substring->length == 0) {
6120 Py_DECREF(list);
6121 PyErr_SetString(PyExc_ValueError, "empty separator");
6122 return NULL;
6123 }
6124 else
6125 return rsplit_substring(self,list,substring,maxcount);
6126}
6127
6128static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129PyObject *replace(PyUnicodeObject *self,
6130 PyUnicodeObject *str1,
6131 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133{
6134 PyUnicodeObject *u;
6135
6136 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006137 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Thomas Wouters477c8d52006-05-27 19:21:47 +00006139 if (str1->length == str2->length) {
6140 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006141 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006142 if (str1->length == 1) {
6143 /* replace characters */
6144 Py_UNICODE u1, u2;
6145 if (!findchar(self->str, self->length, str1->str[0]))
6146 goto nothing;
6147 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6148 if (!u)
6149 return NULL;
6150 Py_UNICODE_COPY(u->str, self->str, self->length);
6151 u1 = str1->str[0];
6152 u2 = str2->str[0];
6153 for (i = 0; i < u->length; i++)
6154 if (u->str[i] == u1) {
6155 if (--maxcount < 0)
6156 break;
6157 u->str[i] = u2;
6158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006160 i = fastsearch(
6161 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 if (i < 0)
6164 goto nothing;
6165 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6166 if (!u)
6167 return NULL;
6168 Py_UNICODE_COPY(u->str, self->str, self->length);
6169 while (i <= self->length - str1->length)
6170 if (Py_UNICODE_MATCH(self, i, str1)) {
6171 if (--maxcount < 0)
6172 break;
6173 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6174 i += str1->length;
6175 } else
6176 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179
6180 Py_ssize_t n, i, j, e;
6181 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 Py_UNICODE *p;
6183
6184 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006185 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 if (n > maxcount)
6187 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006188 if (n == 0)
6189 goto nothing;
6190 /* new_size = self->length + n * (str2->length - str1->length)); */
6191 delta = (str2->length - str1->length);
6192 if (delta == 0) {
6193 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006195 product = n * (str2->length - str1->length);
6196 if ((product / (str2->length - str1->length)) != n) {
6197 PyErr_SetString(PyExc_OverflowError,
6198 "replace string is too long");
6199 return NULL;
6200 }
6201 new_size = self->length + product;
6202 if (new_size < 0) {
6203 PyErr_SetString(PyExc_OverflowError,
6204 "replace string is too long");
6205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
6207 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006208 u = _PyUnicode_New(new_size);
6209 if (!u)
6210 return NULL;
6211 i = 0;
6212 p = u->str;
6213 e = self->length - str1->length;
6214 if (str1->length > 0) {
6215 while (n-- > 0) {
6216 /* look for next match */
6217 j = i;
6218 while (j <= e) {
6219 if (Py_UNICODE_MATCH(self, j, str1))
6220 break;
6221 j++;
6222 }
6223 if (j > i) {
6224 if (j > e)
6225 break;
6226 /* copy unchanged part [i:j] */
6227 Py_UNICODE_COPY(p, self->str+i, j-i);
6228 p += j - i;
6229 }
6230 /* copy substitution string */
6231 if (str2->length > 0) {
6232 Py_UNICODE_COPY(p, str2->str, str2->length);
6233 p += str2->length;
6234 }
6235 i = j + str1->length;
6236 }
6237 if (i < self->length)
6238 /* copy tail [i:] */
6239 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6240 } else {
6241 /* interleave */
6242 while (n > 0) {
6243 Py_UNICODE_COPY(p, str2->str, str2->length);
6244 p += str2->length;
6245 if (--n <= 0)
6246 break;
6247 *p++ = self->str[i++];
6248 }
6249 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253
6254nothing:
6255 /* nothing to replace; return original string (when possible) */
6256 if (PyUnicode_CheckExact(self)) {
6257 Py_INCREF(self);
6258 return (PyObject *) self;
6259 }
6260 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261}
6262
6263/* --- Unicode Object Methods --------------------------------------------- */
6264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006266"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267\n\
6268Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006269characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
6271static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006272unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return fixup(self, fixtitle);
6275}
6276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006277PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006278"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279\n\
6280Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006281have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006284unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return fixup(self, fixcapitalize);
6287}
6288
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the list with
   PyUnicode_Join(NULL, ...).  Returns NULL with an exception set on
   failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);
        PyObject *capped = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
6326
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006327/* Argument converter. Coerces to a single unicode character */
6328
6329static int
6330convert_uc(PyObject *obj, void *addr)
6331{
6332 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6333 PyObject *uniobj;
6334 Py_UNICODE *unistr;
6335
6336 uniobj = PyUnicode_FromObject(obj);
6337 if (uniobj == NULL) {
6338 PyErr_SetString(PyExc_TypeError,
6339 "The fill character cannot be converted to Unicode");
6340 return 0;
6341 }
6342 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6343 PyErr_SetString(PyExc_TypeError,
6344 "The fill character must be exactly one character long");
6345 Py_DECREF(uniobj);
6346 return 0;
6347 }
6348 unistr = PyUnicode_AS_UNICODE(uniobj);
6349 *fillcharloc = unistr[0];
6350 Py_DECREF(uniobj);
6351 return 1;
6352}
6353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006354PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006355"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006357Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006358done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359
6360static PyObject *
6361unicode_center(PyUnicodeObject *self, PyObject *args)
6362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 Py_ssize_t marg, left;
6364 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006365 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
Thomas Woutersde017742006-02-16 19:34:37 +00006367 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
6369
Tim Peters7a29bd52001-09-12 03:03:31 +00006370 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 Py_INCREF(self);
6372 return (PyObject*) self;
6373 }
6374
6375 marg = width - self->length;
6376 left = marg / 2 + (marg & width & 1);
6377
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006378 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379}
6380
Marc-André Lemburge5034372000-08-08 08:04:29 +00006381#if 0
6382
6383/* This code should go into some future Unicode collation support
6384 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006385 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006386
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006387/* speedy UTF-16 code point order comparison */
6388/* gleaned from: */
6389/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6390
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006391static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006392{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006393 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006394 0, 0, 0, 0, 0, 0, 0, 0,
6395 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006396 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006397};
6398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399static int
6400unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6401{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 Py_UNICODE *s1 = str1->str;
6405 Py_UNICODE *s2 = str2->str;
6406
6407 len1 = str1->length;
6408 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006411 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006412
6413 c1 = *s1++;
6414 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006415
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006416 if (c1 > (1<<11) * 26)
6417 c1 += utf16Fixup[c1>>11];
6418 if (c2 > (1<<11) * 26)
6419 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006420 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006421
6422 if (c1 != c2)
6423 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006424
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006425 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
6427
6428 return (len1 < len2) ? -1 : (len1 != len2);
6429}
6430
Marc-André Lemburge5034372000-08-08 08:04:29 +00006431#else
6432
6433static int
6434unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6435{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006436 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006437
6438 Py_UNICODE *s1 = str1->str;
6439 Py_UNICODE *s2 = str2->str;
6440
6441 len1 = str1->length;
6442 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006443
Marc-André Lemburge5034372000-08-08 08:04:29 +00006444 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006445 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006446
Fredrik Lundh45714e92001-06-26 16:39:36 +00006447 c1 = *s1++;
6448 c2 = *s2++;
6449
6450 if (c1 != c2)
6451 return (c1 < c2) ? -1 : 1;
6452
Marc-André Lemburge5034372000-08-08 08:04:29 +00006453 len1--; len2--;
6454 }
6455
6456 return (len1 < len2) ? -1 : (len1 != len2);
6457}
6458
6459#endif
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461int PyUnicode_Compare(PyObject *left,
6462 PyObject *right)
6463{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006464 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6465 return unicode_compare((PyUnicodeObject *)left,
6466 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006467 PyErr_Format(PyExc_TypeError,
6468 "Can't compare %.100s and %.100s",
6469 left->ob_type->tp_name,
6470 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return -1;
6472}
6473
Martin v. Löwis5b222132007-06-10 09:51:05 +00006474int
6475PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6476{
6477 int i;
6478 Py_UNICODE *id;
6479 assert(PyUnicode_Check(uni));
6480 id = PyUnicode_AS_UNICODE(uni);
6481 /* Compare Unicode string and source character set string */
6482 for (i = 0; id[i] && str[i]; i++)
6483 if (id[i] != str[i])
6484 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6485 if (id[i])
6486 return 1; /* uni is longer */
6487 if (str[i])
6488 return -1; /* str is longer */
6489 return 0;
6490}
6491
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006492PyObject *PyUnicode_RichCompare(PyObject *left,
6493 PyObject *right,
6494 int op)
6495{
6496 int result;
6497
6498 result = PyUnicode_Compare(left, right);
6499 if (result == -1 && PyErr_Occurred())
6500 goto onError;
6501
6502 /* Convert the return value to a Boolean */
6503 switch (op) {
6504 case Py_EQ:
6505 result = (result == 0);
6506 break;
6507 case Py_NE:
6508 result = (result != 0);
6509 break;
6510 case Py_LE:
6511 result = (result <= 0);
6512 break;
6513 case Py_GE:
6514 result = (result >= 0);
6515 break;
6516 case Py_LT:
6517 result = (result == -1);
6518 break;
6519 case Py_GT:
6520 result = (result == 1);
6521 break;
6522 }
6523 return PyBool_FromLong(result);
6524
6525 onError:
6526
6527 /* Standard case
6528
6529 Type errors mean that PyUnicode_FromObject() could not convert
6530 one of the arguments (usually the right hand side) to Unicode,
6531 ie. we can't handle the comparison request. However, it is
6532 possible that the other object knows a comparison method, which
6533 is why we return Py_NotImplemented to give the other object a
6534 chance.
6535
6536 */
6537 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6538 PyErr_Clear();
6539 Py_INCREF(Py_NotImplemented);
6540 return Py_NotImplemented;
6541 }
6542 if (op != Py_EQ && op != Py_NE)
6543 return NULL;
6544
6545 /* Equality comparison.
6546
6547 This is a special case: we silence any PyExc_UnicodeDecodeError
6548 and instead turn it into a PyErr_UnicodeWarning.
6549
6550 */
6551 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6552 return NULL;
6553 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006554 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6555 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006556 "equal comparison "
6557 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006558 "interpreting them as being unequal"
6559 :
6560 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006561 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006562 "interpreting them as being unequal",
6563 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006564 return NULL;
6565 result = (op == Py_NE);
6566 return PyBool_FromLong(result);
6567}
6568
Guido van Rossum403d68b2000-03-13 15:55:09 +00006569int PyUnicode_Contains(PyObject *container,
6570 PyObject *element)
6571{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006574
6575 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006576 sub = PyUnicode_FromObject(element);
6577 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006578 PyErr_Format(PyExc_TypeError,
6579 "'in <string>' requires string as left operand, not %s",
6580 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006582 }
6583
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 str = PyUnicode_FromObject(container);
6585 if (!str) {
6586 Py_DECREF(sub);
6587 return -1;
6588 }
6589
6590 result = stringlib_contains_obj(str, sub);
6591
6592 Py_DECREF(str);
6593 Py_DECREF(sub);
6594
Guido van Rossum403d68b2000-03-13 15:55:09 +00006595 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006596}
6597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598/* Concat to string or Unicode object giving a new Unicode object. */
6599
6600PyObject *PyUnicode_Concat(PyObject *left,
6601 PyObject *right)
6602{
6603 PyUnicodeObject *u = NULL, *v = NULL, *w;
6604
6605 /* Coerce the two arguments */
6606 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6607 if (u == NULL)
6608 goto onError;
6609 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6610 if (v == NULL)
6611 goto onError;
6612
6613 /* Shortcuts */
6614 if (v == unicode_empty) {
6615 Py_DECREF(v);
6616 return (PyObject *)u;
6617 }
6618 if (u == unicode_empty) {
6619 Py_DECREF(u);
6620 return (PyObject *)v;
6621 }
6622
6623 /* Concat the two Unicode strings */
6624 w = _PyUnicode_New(u->length + v->length);
6625 if (w == NULL)
6626 goto onError;
6627 Py_UNICODE_COPY(w->str, u->str, u->length);
6628 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6629
6630 Py_DECREF(u);
6631 Py_DECREF(v);
6632 return (PyObject *)w;
6633
6634onError:
6635 Py_XDECREF(u);
6636 Py_XDECREF(v);
6637 return NULL;
6638}
6639
Walter Dörwald1ab83302007-05-18 17:15:44 +00006640void
6641PyUnicode_Append(PyObject **pleft, PyObject *right)
6642{
6643 PyObject *new;
6644 if (*pleft == NULL)
6645 return;
6646 if (right == NULL || !PyUnicode_Check(*pleft)) {
6647 Py_DECREF(*pleft);
6648 *pleft = NULL;
6649 return;
6650 }
6651 new = PyUnicode_Concat(*pleft, right);
6652 Py_DECREF(*pleft);
6653 *pleft = new;
6654}
6655
6656void
6657PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6658{
6659 PyUnicode_Append(pleft, right);
6660 Py_XDECREF(right);
6661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664"S.count(sub[, start[, end]]) -> int\n\
6665\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006666Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006667string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject *
6671unicode_count(PyUnicodeObject *self, PyObject *args)
6672{
6673 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006675 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 PyObject *result;
6677
Guido van Rossumb8872e62000-05-09 14:14:27 +00006678 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6679 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
6681
6682 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 if (substring == NULL)
6685 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006686
Thomas Wouters477c8d52006-05-27 19:21:47 +00006687 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Christian Heimes217cfd12007-12-02 14:31:20 +00006689 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006690 stringlib_count(self->str + start, end - start,
6691 substring->str, substring->length)
6692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return result;
6697}
6698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006699PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006700"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006702Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006703to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006704handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6706'xmlcharrefreplace' as well as any other name registered with\n\
6707codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
6709static PyObject *
6710unicode_encode(PyUnicodeObject *self, PyObject *args)
6711{
6712 char *encoding = NULL;
6713 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006714 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6717 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006718 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006719 if (v == NULL)
6720 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006721 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006722 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006723 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006724 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006725 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006726 Py_DECREF(v);
6727 return NULL;
6728 }
6729 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006730
6731 onError:
6732 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006733}
6734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006736"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737\n\
6738Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006739If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741static PyObject*
6742unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6743{
6744 Py_UNICODE *e;
6745 Py_UNICODE *p;
6746 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006747 Py_UNICODE *qe;
6748 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 PyUnicodeObject *u;
6750 int tabsize = 8;
6751
6752 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6753 return NULL;
6754
Thomas Wouters7e474022000-07-16 12:04:32 +00006755 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006756 i = 0; /* chars up to and including most recent \n or \r */
6757 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6758 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 for (p = self->str; p < e; p++)
6760 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006761 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006762 incr = tabsize - (j % tabsize); /* cannot overflow */
6763 if (j > PY_SSIZE_T_MAX - incr)
6764 goto overflow1;
6765 j += incr;
6766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
6768 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006769 if (j > PY_SSIZE_T_MAX - 1)
6770 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 j++;
6772 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006773 if (i > PY_SSIZE_T_MAX - j)
6774 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006776 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 }
6778 }
6779
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006780 if (i > PY_SSIZE_T_MAX - j)
6781 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 /* Second pass: create output string and fill it */
6784 u = _PyUnicode_New(i + j);
6785 if (!u)
6786 return NULL;
6787
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006788 j = 0; /* same as in first pass */
6789 q = u->str; /* next output char */
6790 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
6792 for (p = self->str; p < e; p++)
6793 if (*p == '\t') {
6794 if (tabsize > 0) {
6795 i = tabsize - (j % tabsize);
6796 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006797 while (i--) {
6798 if (q >= qe)
6799 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 }
6803 }
6804 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006805 if (q >= qe)
6806 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006808 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 if (*p == '\n' || *p == '\r')
6810 j = 0;
6811 }
6812
6813 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006814
6815 overflow2:
6816 Py_DECREF(u);
6817 overflow1:
6818 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006823"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824\n\
6825Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006826such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827arguments start and end are interpreted as in slice notation.\n\
6828\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject *
6832unicode_find(PyUnicodeObject *self, PyObject *args)
6833{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006834 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006835 Py_ssize_t start;
6836 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006837 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
Christian Heimes9cd17752007-11-18 19:35:23 +00006839 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Thomas Wouters477c8d52006-05-27 19:21:47 +00006842 result = stringlib_find_slice(
6843 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6844 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6845 start, end
6846 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006849
Christian Heimes217cfd12007-12-02 14:31:20 +00006850 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851}
6852
6853static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
6856 if (index < 0 || index >= self->length) {
6857 PyErr_SetString(PyExc_IndexError, "string index out of range");
6858 return NULL;
6859 }
6860
6861 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6862}
6863
Guido van Rossumc2504932007-09-18 19:42:40 +00006864/* Believe it or not, this produces the same value for ASCII strings
6865 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006867unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Guido van Rossumc2504932007-09-18 19:42:40 +00006869 Py_ssize_t len;
6870 Py_UNICODE *p;
6871 long x;
6872
6873 if (self->hash != -1)
6874 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006875 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006876 p = self->str;
6877 x = *p << 7;
6878 while (--len >= 0)
6879 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006880 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006881 if (x == -1)
6882 x = -2;
6883 self->hash = x;
6884 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006888"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject *
6893unicode_index(PyUnicodeObject *self, PyObject *args)
6894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006896 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006897 Py_ssize_t start;
6898 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
Christian Heimes9cd17752007-11-18 19:35:23 +00006900 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Thomas Wouters477c8d52006-05-27 19:21:47 +00006903 result = stringlib_find_slice(
6904 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6905 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6906 start, end
6907 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 if (result < 0) {
6912 PyErr_SetString(PyExc_ValueError, "substring not found");
6913 return NULL;
6914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006915
Christian Heimes217cfd12007-12-02 14:31:20 +00006916 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917}
6918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
6925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006926unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6929 register const Py_UNICODE *e;
6930 int cased;
6931
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 /* Shortcut for single character strings */
6933 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006936 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006937 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 e = p + PyUnicode_GET_SIZE(self);
6941 cased = 0;
6942 for (; p < e; p++) {
6943 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 else if (!cased && Py_UNICODE_ISLOWER(ch))
6948 cased = 1;
6949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951}
6952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006953PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006956Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006960unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963 register const Py_UNICODE *e;
6964 int cased;
6965
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 /* Shortcut for single character strings */
6967 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006970 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006971 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 e = p + PyUnicode_GET_SIZE(self);
6975 cased = 0;
6976 for (; p < e; p++) {
6977 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 else if (!cased && Py_UNICODE_ISUPPER(ch))
6982 cased = 1;
6983 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006990Return True if S is a titlecased string and there is at least one\n\
6991character in S, i.e. upper- and titlecase characters may only\n\
6992follow uncased characters and lowercase characters only cased ones.\n\
6993Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
6998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6999 register const Py_UNICODE *e;
7000 int cased, previous_is_cased;
7001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 /* Shortcut for single character strings */
7003 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007004 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7005 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007007 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007008 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 e = p + PyUnicode_GET_SIZE(self);
7012 cased = 0;
7013 previous_is_cased = 0;
7014 for (; p < e; p++) {
7015 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7018 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 previous_is_cased = 1;
7021 cased = 1;
7022 }
7023 else if (Py_UNICODE_ISLOWER(ch)) {
7024 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 previous_is_cased = 1;
7027 cased = 1;
7028 }
7029 else
7030 previous_is_cased = 0;
7031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007032 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033}
7034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007036"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007038Return True if all characters in S are whitespace\n\
7039and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7045 register const Py_UNICODE *e;
7046
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 /* Shortcut for single character strings */
7048 if (PyUnicode_GET_SIZE(self) == 1 &&
7049 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007050 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007052 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007053 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 e = p + PyUnicode_GET_SIZE(self);
7057 for (; p < e; p++) {
7058 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007059 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007061 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007064PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007065"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007067Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007069
7070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007071unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007072{
7073 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7074 register const Py_UNICODE *e;
7075
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007076 /* Shortcut for single character strings */
7077 if (PyUnicode_GET_SIZE(self) == 1 &&
7078 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007079 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007080
7081 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007082 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007083 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007084
7085 e = p + PyUnicode_GET_SIZE(self);
7086 for (; p < e; p++) {
7087 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007088 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007090 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007091}
7092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007093PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007094"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007095\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007096Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007098
7099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007100unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007101{
7102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7103 register const Py_UNICODE *e;
7104
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007105 /* Shortcut for single character strings */
7106 if (PyUnicode_GET_SIZE(self) == 1 &&
7107 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007108 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007109
7110 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007111 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007113
7114 e = p + PyUnicode_GET_SIZE(self);
7115 for (; p < e; p++) {
7116 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007117 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007119 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007120}
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007123"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127
7128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007129unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
7131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7132 register const Py_UNICODE *e;
7133
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 /* Shortcut for single character strings */
7135 if (PyUnicode_GET_SIZE(self) == 1 &&
7136 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007139 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007140 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007142
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 e = p + PyUnicode_GET_SIZE(self);
7144 for (; p < e; p++) {
7145 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007146 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007148 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149}
7150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007151PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007152"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007154Return True if all characters in S are digits\n\
7155and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007158unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
7160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7161 register const Py_UNICODE *e;
7162
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 /* Shortcut for single character strings */
7164 if (PyUnicode_GET_SIZE(self) == 1 &&
7165 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007168 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007169 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007171
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 e = p + PyUnicode_GET_SIZE(self);
7173 for (; p < e; p++) {
7174 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007181"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007183Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007184False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007187unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188{
7189 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7190 register const Py_UNICODE *e;
7191
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 /* Shortcut for single character strings */
7193 if (PyUnicode_GET_SIZE(self) == 1 &&
7194 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007195 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007197 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007198 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007199 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 e = p + PyUnicode_GET_SIZE(self);
7202 for (; p < e; p++) {
7203 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007206 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Martin v. Löwis47383402007-08-15 07:32:56 +00007209int
7210PyUnicode_IsIdentifier(PyObject *self)
7211{
7212 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7213 register const Py_UNICODE *e;
7214
7215 /* Special case for empty strings */
7216 if (PyUnicode_GET_SIZE(self) == 0)
7217 return 0;
7218
7219 /* PEP 3131 says that the first character must be in
7220 XID_Start and subsequent characters in XID_Continue,
7221 and for the ASCII range, the 2.x rules apply (i.e
7222 start with letters and underscore, continue with
7223 letters, digits, underscore). However, given the current
7224 definition of XID_Start and XID_Continue, it is sufficient
7225 to check just for these, except that _ must be allowed
7226 as starting an identifier. */
7227 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7228 return 0;
7229
7230 e = p + PyUnicode_GET_SIZE(self);
7231 for (p++; p < e; p++) {
7232 if (!_PyUnicode_IsXidContinue(*p))
7233 return 0;
7234 }
7235 return 1;
7236}
7237
7238PyDoc_STRVAR(isidentifier__doc__,
7239"S.isidentifier() -> bool\n\
7240\n\
7241Return True if S is a valid identifier according\n\
7242to the language definition.");
7243
7244static PyObject*
7245unicode_isidentifier(PyObject *self)
7246{
7247 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7248}
7249
Georg Brandl559e5d72008-06-11 18:37:52 +00007250PyDoc_STRVAR(isprintable__doc__,
7251"S.isprintable() -> bool\n\
7252\n\
7253Return True if all characters in S are considered\n\
7254printable in repr() or S is empty, False otherwise.");
7255
7256static PyObject*
7257unicode_isprintable(PyObject *self)
7258{
7259 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7260 register const Py_UNICODE *e;
7261
7262 /* Shortcut for single character strings */
7263 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7264 Py_RETURN_TRUE;
7265 }
7266
7267 e = p + PyUnicode_GET_SIZE(self);
7268 for (; p < e; p++) {
7269 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7270 Py_RETURN_FALSE;
7271 }
7272 }
7273 Py_RETURN_TRUE;
7274}
7275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007277"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278\n\
7279Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
7282static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007283unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007285 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286}
7287
Martin v. Löwis18e16552006-02-15 17:27:45 +00007288static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289unicode_length(PyUnicodeObject *self)
7290{
7291 return self->length;
7292}
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007295"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296\n\
7297Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007298done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299
7300static PyObject *
7301unicode_ljust(PyUnicodeObject *self, PyObject *args)
7302{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007303 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007304 Py_UNICODE fillchar = ' ';
7305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007306 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return NULL;
7308
Tim Peters7a29bd52001-09-12 03:03:31 +00007309 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 Py_INCREF(self);
7311 return (PyObject*) self;
7312 }
7313
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007314 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007318"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321
7322static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007323unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 return fixup(self, fixlower);
7326}
7327
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7336
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007337/* externally visible for str.strip(unicode) */
7338PyObject *
7339_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7340{
7341 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007342 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007344 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7345 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007346
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7348
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007349 i = 0;
7350 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7352 i++;
7353 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007354 }
7355
7356 j = len;
7357 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007358 do {
7359 j--;
7360 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7361 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007362 }
7363
7364 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007365 Py_INCREF(self);
7366 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007367 }
7368 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007369 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007370}
7371
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
7373static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007374do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007376 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007377 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007378
7379 i = 0;
7380 if (striptype != RIGHTSTRIP) {
7381 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7382 i++;
7383 }
7384 }
7385
7386 j = len;
7387 if (striptype != LEFTSTRIP) {
7388 do {
7389 j--;
7390 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7391 j++;
7392 }
7393
7394 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7395 Py_INCREF(self);
7396 return (PyObject*)self;
7397 }
7398 else
7399 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007402
7403static PyObject *
7404do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7405{
7406 PyObject *sep = NULL;
7407
7408 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7409 return NULL;
7410
7411 if (sep != NULL && sep != Py_None) {
7412 if (PyUnicode_Check(sep))
7413 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007414 else {
7415 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007416 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007417 STRIPNAME(striptype));
7418 return NULL;
7419 }
7420 }
7421
7422 return do_strip(self, striptype);
7423}
7424
7425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007426PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007427"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007428\n\
7429Return a copy of the string S with leading and trailing\n\
7430whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007431If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007432
7433static PyObject *
7434unicode_strip(PyUnicodeObject *self, PyObject *args)
7435{
7436 if (PyTuple_GET_SIZE(args) == 0)
7437 return do_strip(self, BOTHSTRIP); /* Common case */
7438 else
7439 return do_argstrip(self, BOTHSTRIP, args);
7440}
7441
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007444"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007445\n\
7446Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007447If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007448
7449static PyObject *
7450unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7451{
7452 if (PyTuple_GET_SIZE(args) == 0)
7453 return do_strip(self, LEFTSTRIP); /* Common case */
7454 else
7455 return do_argstrip(self, LEFTSTRIP, args);
7456}
7457
7458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007459PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007460"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007461\n\
7462Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007463If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007464
7465static PyObject *
7466unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7467{
7468 if (PyTuple_GET_SIZE(args) == 0)
7469 return do_strip(self, RIGHTSTRIP); /* Common case */
7470 else
7471 return do_argstrip(self, RIGHTSTRIP, args);
7472}
7473
7474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007476unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477{
7478 PyUnicodeObject *u;
7479 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007480 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007481 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483 if (len < 0)
7484 len = 0;
7485
Tim Peters7a29bd52001-09-12 03:03:31 +00007486 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 /* no repeat, return original string */
7488 Py_INCREF(str);
7489 return (PyObject*) str;
7490 }
Tim Peters8f422462000-09-09 06:13:41 +00007491
7492 /* ensure # of chars needed doesn't overflow int and # of bytes
7493 * needed doesn't overflow size_t
7494 */
7495 nchars = len * str->length;
7496 if (len && nchars / len != str->length) {
7497 PyErr_SetString(PyExc_OverflowError,
7498 "repeated string is too long");
7499 return NULL;
7500 }
7501 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7502 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7503 PyErr_SetString(PyExc_OverflowError,
7504 "repeated string is too long");
7505 return NULL;
7506 }
7507 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 if (!u)
7509 return NULL;
7510
7511 p = u->str;
7512
Thomas Wouters477c8d52006-05-27 19:21:47 +00007513 if (str->length == 1 && len > 0) {
7514 Py_UNICODE_FILL(p, str->str[0], len);
7515 } else {
7516 Py_ssize_t done = 0; /* number of characters copied this far */
7517 if (done < nchars) {
7518 Py_UNICODE_COPY(p, str->str, str->length);
7519 done = str->length;
7520 }
7521 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007522 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007523 Py_UNICODE_COPY(p+done, p, n);
7524 done += n;
7525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
7527
7528 return (PyObject*) u;
7529}
7530
7531PyObject *PyUnicode_Replace(PyObject *obj,
7532 PyObject *subobj,
7533 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007534 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535{
7536 PyObject *self;
7537 PyObject *str1;
7538 PyObject *str2;
7539 PyObject *result;
7540
7541 self = PyUnicode_FromObject(obj);
7542 if (self == NULL)
7543 return NULL;
7544 str1 = PyUnicode_FromObject(subobj);
7545 if (str1 == NULL) {
7546 Py_DECREF(self);
7547 return NULL;
7548 }
7549 str2 = PyUnicode_FromObject(replobj);
7550 if (str2 == NULL) {
7551 Py_DECREF(self);
7552 Py_DECREF(str1);
7553 return NULL;
7554 }
Tim Petersced69f82003-09-16 20:30:58 +00007555 result = replace((PyUnicodeObject *)self,
7556 (PyUnicodeObject *)str1,
7557 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 maxcount);
7559 Py_DECREF(self);
7560 Py_DECREF(str1);
7561 Py_DECREF(str2);
7562 return result;
7563}
7564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007565PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007566"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567\n\
7568Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007569old replaced by new. If the optional argument count is\n\
7570given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
7573unicode_replace(PyUnicodeObject *self, PyObject *args)
7574{
7575 PyUnicodeObject *str1;
7576 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007577 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 PyObject *result;
7579
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 return NULL;
7582 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7583 if (str1 == NULL)
7584 return NULL;
7585 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007586 if (str2 == NULL) {
7587 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591 result = replace(self, str1, str2, maxcount);
7592
7593 Py_DECREF(str1);
7594 Py_DECREF(str2);
7595 return result;
7596}
7597
7598static
7599PyObject *unicode_repr(PyObject *unicode)
7600{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007601 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007602 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007603 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7604 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7605
7606 /* XXX(nnorwitz): rather than over-allocating, it would be
7607 better to choose a different scheme. Perhaps scan the
7608 first N-chars of the string and allocate based on that size.
7609 */
7610 /* Initial allocation is based on the longest-possible unichr
7611 escape.
7612
7613 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7614 unichr, so in this case it's the longest unichr escape. In
7615 narrow (UTF-16) builds this is five chars per source unichr
7616 since there are two unichrs in the surrogate pair, so in narrow
7617 (UTF-16) builds it's not the longest unichr escape.
7618
7619 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7620 so in the narrow (UTF-16) build case it's the longest unichr
7621 escape.
7622 */
7623
Walter Dörwald1ab83302007-05-18 17:15:44 +00007624 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007625 2 /* quotes */
7626#ifdef Py_UNICODE_WIDE
7627 + 10*size
7628#else
7629 + 6*size
7630#endif
7631 + 1);
7632 if (repr == NULL)
7633 return NULL;
7634
Walter Dörwald1ab83302007-05-18 17:15:44 +00007635 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007636
7637 /* Add quote */
7638 *p++ = (findchar(s, size, '\'') &&
7639 !findchar(s, size, '"')) ? '"' : '\'';
7640 while (size-- > 0) {
7641 Py_UNICODE ch = *s++;
7642
7643 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007644 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007645 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007646 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007647 continue;
7648 }
7649
Georg Brandl559e5d72008-06-11 18:37:52 +00007650 /* Map special whitespace to '\t', \n', '\r' */
7651 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007652 *p++ = '\\';
7653 *p++ = 't';
7654 }
7655 else if (ch == '\n') {
7656 *p++ = '\\';
7657 *p++ = 'n';
7658 }
7659 else if (ch == '\r') {
7660 *p++ = '\\';
7661 *p++ = 'r';
7662 }
7663
7664 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007665 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007666 *p++ = '\\';
7667 *p++ = 'x';
7668 *p++ = hexdigits[(ch >> 4) & 0x000F];
7669 *p++ = hexdigits[ch & 0x000F];
7670 }
7671
Georg Brandl559e5d72008-06-11 18:37:52 +00007672 /* Copy ASCII characters as-is */
7673 else if (ch < 0x7F) {
7674 *p++ = ch;
7675 }
7676
7677 /* Non-ASCII characters */
7678 else {
7679 Py_UCS4 ucs = ch;
7680
7681#ifndef Py_UNICODE_WIDE
7682 Py_UNICODE ch2 = 0;
7683 /* Get code point from surrogate pair */
7684 if (size > 0) {
7685 ch2 = *s;
7686 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7687 && ch2 <= 0xDFFF) {
7688 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7689 + 0x00010000;
7690 s++;
7691 size--;
7692 }
7693 }
7694#endif
7695 /* Map Unicode whitespace and control characters
7696 (categories Z* and C* except ASCII space)
7697 */
7698 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7699 /* Map 8-bit characters to '\xhh' */
7700 if (ucs <= 0xff) {
7701 *p++ = '\\';
7702 *p++ = 'x';
7703 *p++ = hexdigits[(ch >> 4) & 0x000F];
7704 *p++ = hexdigits[ch & 0x000F];
7705 }
7706 /* Map 21-bit characters to '\U00xxxxxx' */
7707 else if (ucs >= 0x10000) {
7708 *p++ = '\\';
7709 *p++ = 'U';
7710 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7711 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7712 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7713 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7714 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7715 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7716 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7717 *p++ = hexdigits[ucs & 0x0000000F];
7718 }
7719 /* Map 16-bit characters to '\uxxxx' */
7720 else {
7721 *p++ = '\\';
7722 *p++ = 'u';
7723 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7724 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7725 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7726 *p++ = hexdigits[ucs & 0x000F];
7727 }
7728 }
7729 /* Copy characters as-is */
7730 else {
7731 *p++ = ch;
7732#ifndef Py_UNICODE_WIDE
7733 if (ucs >= 0x10000)
7734 *p++ = ch2;
7735#endif
7736 }
7737 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007738 }
7739 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007740 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007741
7742 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007743 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007744 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007748"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749\n\
7750Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007751such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752arguments start and end are interpreted as in slice notation.\n\
7753\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject *
7757unicode_rfind(PyUnicodeObject *self, PyObject *args)
7758{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007759 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007760 Py_ssize_t start;
7761 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007762 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
Christian Heimes9cd17752007-11-18 19:35:23 +00007764 if (!_ParseTupleFinds(args, &substring, &start, &end))
7765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
Thomas Wouters477c8d52006-05-27 19:21:47 +00007767 result = stringlib_rfind_slice(
7768 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7769 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7770 start, end
7771 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
7773 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007774
Christian Heimes217cfd12007-12-02 14:31:20 +00007775 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776}
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007779"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
7783static PyObject *
7784unicode_rindex(PyUnicodeObject *self, PyObject *args)
7785{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007786 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007787 Py_ssize_t start;
7788 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007789 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
Christian Heimes9cd17752007-11-18 19:35:23 +00007791 if (!_ParseTupleFinds(args, &substring, &start, &end))
7792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Thomas Wouters477c8d52006-05-27 19:21:47 +00007794 result = stringlib_rfind_slice(
7795 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7796 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7797 start, end
7798 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
7800 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (result < 0) {
7803 PyErr_SetString(PyExc_ValueError, "substring not found");
7804 return NULL;
7805 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007806 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807}
7808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007809PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007810"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007812Return S right justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007813done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
7815static PyObject *
7816unicode_rjust(PyUnicodeObject *self, PyObject *args)
7817{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007818 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007819 Py_UNICODE fillchar = ' ';
7820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007821 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 return NULL;
7823
Tim Peters7a29bd52001-09-12 03:03:31 +00007824 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 Py_INCREF(self);
7826 return (PyObject*) self;
7827 }
7828
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007829 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832PyObject *PyUnicode_Split(PyObject *s,
7833 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
7836 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007837
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 s = PyUnicode_FromObject(s);
7839 if (s == NULL)
7840 return NULL;
7841 if (sep != NULL) {
7842 sep = PyUnicode_FromObject(sep);
7843 if (sep == NULL) {
7844 Py_DECREF(s);
7845 return NULL;
7846 }
7847 }
7848
7849 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7850
7851 Py_DECREF(s);
7852 Py_XDECREF(sep);
7853 return result;
7854}
7855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007856PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007857"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858\n\
7859Return a list of the words in S, using sep as the\n\
7860delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007861splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007862whitespace string is a separator and empty strings are\n\
7863removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
7865static PyObject*
7866unicode_split(PyUnicodeObject *self, PyObject *args)
7867{
7868 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 return NULL;
7873
7874 if (substring == Py_None)
7875 return split(self, NULL, maxcount);
7876 else if (PyUnicode_Check(substring))
7877 return split(self, (PyUnicodeObject *)substring, maxcount);
7878 else
7879 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7880}
7881
Thomas Wouters477c8d52006-05-27 19:21:47 +00007882PyObject *
7883PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7884{
7885 PyObject* str_obj;
7886 PyObject* sep_obj;
7887 PyObject* out;
7888
7889 str_obj = PyUnicode_FromObject(str_in);
7890 if (!str_obj)
7891 return NULL;
7892 sep_obj = PyUnicode_FromObject(sep_in);
7893 if (!sep_obj) {
7894 Py_DECREF(str_obj);
7895 return NULL;
7896 }
7897
7898 out = stringlib_partition(
7899 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7900 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7901 );
7902
7903 Py_DECREF(sep_obj);
7904 Py_DECREF(str_obj);
7905
7906 return out;
7907}
7908
7909
7910PyObject *
7911PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7912{
7913 PyObject* str_obj;
7914 PyObject* sep_obj;
7915 PyObject* out;
7916
7917 str_obj = PyUnicode_FromObject(str_in);
7918 if (!str_obj)
7919 return NULL;
7920 sep_obj = PyUnicode_FromObject(sep_in);
7921 if (!sep_obj) {
7922 Py_DECREF(str_obj);
7923 return NULL;
7924 }
7925
7926 out = stringlib_rpartition(
7927 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7928 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7929 );
7930
7931 Py_DECREF(sep_obj);
7932 Py_DECREF(str_obj);
7933
7934 return out;
7935}
7936
7937PyDoc_STRVAR(partition__doc__,
7938"S.partition(sep) -> (head, sep, tail)\n\
7939\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007940Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007941the separator itself, and the part after it. If the separator is not\n\
7942found, returns S and two empty strings.");
7943
7944static PyObject*
7945unicode_partition(PyUnicodeObject *self, PyObject *separator)
7946{
7947 return PyUnicode_Partition((PyObject *)self, separator);
7948}
7949
7950PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007951"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007953Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007955separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007956
7957static PyObject*
7958unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7959{
7960 return PyUnicode_RPartition((PyObject *)self, separator);
7961}
7962
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007963PyObject *PyUnicode_RSplit(PyObject *s,
7964 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007965 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007966{
7967 PyObject *result;
7968
7969 s = PyUnicode_FromObject(s);
7970 if (s == NULL)
7971 return NULL;
7972 if (sep != NULL) {
7973 sep = PyUnicode_FromObject(sep);
7974 if (sep == NULL) {
7975 Py_DECREF(s);
7976 return NULL;
7977 }
7978 }
7979
7980 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7981
7982 Py_DECREF(s);
7983 Py_XDECREF(sep);
7984 return result;
7985}
7986
7987PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007988"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007989\n\
7990Return a list of the words in S, using sep as the\n\
7991delimiter string, starting at the end of the string and\n\
7992working to the front. If maxsplit is given, at most maxsplit\n\
7993splits are done. If sep is not specified, any whitespace string\n\
7994is a separator.");
7995
7996static PyObject*
7997unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7998{
7999 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008001
Martin v. Löwis18e16552006-02-15 17:27:45 +00008002 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008003 return NULL;
8004
8005 if (substring == Py_None)
8006 return rsplit(self, NULL, maxcount);
8007 else if (PyUnicode_Check(substring))
8008 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8009 else
8010 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8011}
8012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008013PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00008014"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015\n\
8016Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008017Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
8020static PyObject*
8021unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8022{
Guido van Rossum86662912000-04-11 15:38:46 +00008023 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
Guido van Rossum86662912000-04-11 15:38:46 +00008025 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 return NULL;
8027
Guido van Rossum86662912000-04-11 15:38:46 +00008028 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029}
8030
8031static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008032PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
Walter Dörwald346737f2007-05-31 10:44:43 +00008034 if (PyUnicode_CheckExact(self)) {
8035 Py_INCREF(self);
8036 return self;
8037 } else
8038 /* Subtype -- return genuine unicode string with the same value. */
8039 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8040 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041}
8042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008043PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008044"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045\n\
8046Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008047and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
8049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008050unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return fixup(self, fixswapcase);
8053}
8054
Georg Brandlceee0772007-11-27 23:48:05 +00008055PyDoc_STRVAR(maketrans__doc__,
8056"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8057\n\
8058Return a translation table usable for str.translate().\n\
8059If there is only one argument, it must be a dictionary mapping Unicode\n\
8060ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008061Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008062If there are two arguments, they must be strings of equal length, and\n\
8063in the resulting dictionary, each character in x will be mapped to the\n\
8064character at the same position in y. If there is a third argument, it\n\
8065must be a string, whose characters will be mapped to None in the result.");
8066
8067static PyObject*
8068unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8069{
8070 PyObject *x, *y = NULL, *z = NULL;
8071 PyObject *new = NULL, *key, *value;
8072 Py_ssize_t i = 0;
8073 int res;
8074
8075 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8076 return NULL;
8077 new = PyDict_New();
8078 if (!new)
8079 return NULL;
8080 if (y != NULL) {
8081 /* x must be a string too, of equal length */
8082 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8083 if (!PyUnicode_Check(x)) {
8084 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8085 "be a string if there is a second argument");
8086 goto err;
8087 }
8088 if (PyUnicode_GET_SIZE(x) != ylen) {
8089 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8090 "arguments must have equal length");
8091 goto err;
8092 }
8093 /* create entries for translating chars in x to those in y */
8094 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8096 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008097 if (!key || !value)
8098 goto err;
8099 res = PyDict_SetItem(new, key, value);
8100 Py_DECREF(key);
8101 Py_DECREF(value);
8102 if (res < 0)
8103 goto err;
8104 }
8105 /* create entries for deleting chars in z */
8106 if (z != NULL) {
8107 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008108 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008109 if (!key)
8110 goto err;
8111 res = PyDict_SetItem(new, key, Py_None);
8112 Py_DECREF(key);
8113 if (res < 0)
8114 goto err;
8115 }
8116 }
8117 } else {
8118 /* x must be a dict */
8119 if (!PyDict_Check(x)) {
8120 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8121 "to maketrans it must be a dict");
8122 goto err;
8123 }
8124 /* copy entries into the new dict, converting string keys to int keys */
8125 while (PyDict_Next(x, &i, &key, &value)) {
8126 if (PyUnicode_Check(key)) {
8127 /* convert string keys to integer keys */
8128 PyObject *newkey;
8129 if (PyUnicode_GET_SIZE(key) != 1) {
8130 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8131 "table must be of length 1");
8132 goto err;
8133 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008134 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008135 if (!newkey)
8136 goto err;
8137 res = PyDict_SetItem(new, newkey, value);
8138 Py_DECREF(newkey);
8139 if (res < 0)
8140 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008141 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008142 /* just keep integer keys */
8143 if (PyDict_SetItem(new, key, value) < 0)
8144 goto err;
8145 } else {
8146 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8147 "be strings or integers");
8148 goto err;
8149 }
8150 }
8151 }
8152 return new;
8153 err:
8154 Py_DECREF(new);
8155 return NULL;
8156}
8157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008158PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008159"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160\n\
8161Return a copy of the string S, where all characters have been mapped\n\
8162through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008163Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008164Unmapped characters are left untouched. Characters mapped to None\n\
8165are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008168unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169{
Georg Brandlceee0772007-11-27 23:48:05 +00008170 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171}
8172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008173PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008174"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008176Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
8178static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008179unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 return fixup(self, fixupper);
8182}
8183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008184PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008185"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186\n\
8187Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008188of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189
8190static PyObject *
8191unicode_zfill(PyUnicodeObject *self, PyObject *args)
8192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008193 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 PyUnicodeObject *u;
8195
Martin v. Löwis18e16552006-02-15 17:27:45 +00008196 Py_ssize_t width;
8197 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 return NULL;
8199
8200 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008201 if (PyUnicode_CheckExact(self)) {
8202 Py_INCREF(self);
8203 return (PyObject*) self;
8204 }
8205 else
8206 return PyUnicode_FromUnicode(
8207 PyUnicode_AS_UNICODE(self),
8208 PyUnicode_GET_SIZE(self)
8209 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 }
8211
8212 fill = width - self->length;
8213
8214 u = pad(self, fill, 0, '0');
8215
Walter Dörwald068325e2002-04-15 13:36:47 +00008216 if (u == NULL)
8217 return NULL;
8218
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 if (u->str[fill] == '+' || u->str[fill] == '-') {
8220 /* move sign to beginning of string */
8221 u->str[0] = u->str[fill];
8222 u->str[fill] = '0';
8223 }
8224
8225 return (PyObject*) u;
8226}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
#if 0
/* Compiled out by the enclosing #if 0: returns the value of numfree as
   an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008236PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008237"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008239Return True if S starts with the specified prefix, False otherwise.\n\
8240With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008241With optional end, stop comparing S at that position.\n\
8242prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
8244static PyObject *
8245unicode_startswith(PyUnicodeObject *self,
8246 PyObject *args)
8247{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008248 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008250 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008251 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008252 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008254 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008255 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008257 if (PyTuple_Check(subobj)) {
8258 Py_ssize_t i;
8259 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8260 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8261 PyTuple_GET_ITEM(subobj, i));
8262 if (substring == NULL)
8263 return NULL;
8264 result = tailmatch(self, substring, start, end, -1);
8265 Py_DECREF(substring);
8266 if (result) {
8267 Py_RETURN_TRUE;
8268 }
8269 }
8270 /* nothing matched */
8271 Py_RETURN_FALSE;
8272 }
8273 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008275 return NULL;
8276 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008278 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279}
8280
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008283"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008285Return True if S ends with the specified suffix, False otherwise.\n\
8286With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008287With optional end, stop comparing S at that position.\n\
8288suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289
8290static PyObject *
8291unicode_endswith(PyUnicodeObject *self,
8292 PyObject *args)
8293{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008294 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008297 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008298 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008300 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8301 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008303 if (PyTuple_Check(subobj)) {
8304 Py_ssize_t i;
8305 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8306 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8307 PyTuple_GET_ITEM(subobj, i));
8308 if (substring == NULL)
8309 return NULL;
8310 result = tailmatch(self, substring, start, end, +1);
8311 Py_DECREF(substring);
8312 if (result) {
8313 Py_RETURN_TRUE;
8314 }
8315 }
8316 Py_RETURN_FALSE;
8317 }
8318 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008322 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008324 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325}
8326
Eric Smith8c663262007-08-25 02:26:07 +00008327#include "stringlib/string_format.h"
8328
8329PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008330"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008331\n\
8332");
8333
Eric Smith4a7d76d2008-05-30 18:10:19 +00008334static PyObject *
8335unicode__format__(PyObject* self, PyObject* args)
8336{
8337 PyObject *format_spec;
8338
8339 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8340 return NULL;
8341
8342 return _PyUnicode_FormatAdvanced(self,
8343 PyUnicode_AS_UNICODE(format_spec),
8344 PyUnicode_GET_SIZE(format_spec));
8345}
8346
Eric Smith8c663262007-08-25 02:26:07 +00008347PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008348"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008349\n\
8350");
8351
8352static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008353unicode__sizeof__(PyUnicodeObject *v)
8354{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008355 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8356 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008357}
8358
8359PyDoc_STRVAR(sizeof__doc__,
8360"S.__sizeof__() -> size of S in memory, in bytes");
8361
8362static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008363unicode_getnewargs(PyUnicodeObject *v)
8364{
8365 return Py_BuildValue("(u#)", v->str, v->length);
8366}
8367
8368
/* Method table for the str type.  Each row is
   {method name, C implementation, calling-convention flags, docstring};
   rows with only three fields supply no docstring.  The table ends with
   a {NULL, NULL} row. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369static PyMethodDef unicode_methods[] = {
8370
8371 /* Order is according to common usage: often used methods should
8372 appear first, since lookup is done sequentially. */
8373
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008374 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8375 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8376 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008377 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008378 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8379 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8380 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8381 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8382 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8383 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8384 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008386 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8387 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8388 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008389 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008390 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8391 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8392 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008393 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008395 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008396 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008397 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8398 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8399 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8400 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8401 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8402 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8403 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8404 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8405 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8406 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8407 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8408 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8409 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8410 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008411 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008412 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008413 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008414 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008415 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
/* The two _formatter_* rows carry no docstring. */
Eric Smithf6db4092007-08-27 23:52:26 +00008416 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8417 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
/* maketrans is the only row flagged METH_STATIC. */
Georg Brandlceee0772007-11-27 23:48:05 +00008418 {"maketrans", (PyCFunction) unicode_maketrans,
8419 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008420 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
/* The next two rows are compiled out with #if 0. */
Walter Dörwald068325e2002-04-15 13:36:47 +00008421#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008422 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423#endif
8424
8425#if 0
8426 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008427 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428#endif
8429
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008430 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
/* Terminating row. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 {NULL, NULL}
8432};
8433
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008434static PyObject *
8435unicode_mod(PyObject *v, PyObject *w)
8436{
8437 if (!PyUnicode_Check(v)) {
8438 Py_INCREF(Py_NotImplemented);
8439 return Py_NotImplemented;
8440 }
8441 return PyUnicode_Format(v, w);
8442}
8443
/* Number-protocol slots for str.  Only nb_remainder (the % operator) is
   filled in, with unicode_mod; the initializer stops after that slot. */
8444static PyNumberMethods unicode_as_number = {
8445 0, /*nb_add*/
8446 0, /*nb_subtract*/
8447 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008448 unicode_mod, /*nb_remainder*/
8449};
8450
/* Sequence-protocol slots for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are left as 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008453 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008454 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8455 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008456 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 0, /* sq_ass_item */
8458 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008459 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460};
8461
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008462static PyObject*
8463unicode_subscript(PyUnicodeObject* self, PyObject* item)
8464{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008465 if (PyIndex_Check(item)) {
8466 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008467 if (i == -1 && PyErr_Occurred())
8468 return NULL;
8469 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008470 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008471 return unicode_getitem(self, i);
8472 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008474 Py_UNICODE* source_buf;
8475 Py_UNICODE* result_buf;
8476 PyObject* result;
8477
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008478 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008479 &start, &stop, &step, &slicelength) < 0) {
8480 return NULL;
8481 }
8482
8483 if (slicelength <= 0) {
8484 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008485 } else if (start == 0 && step == 1 && slicelength == self->length &&
8486 PyUnicode_CheckExact(self)) {
8487 Py_INCREF(self);
8488 return (PyObject *)self;
8489 } else if (step == 1) {
8490 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008491 } else {
8492 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008493 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8494 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008495
8496 if (result_buf == NULL)
8497 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008498
8499 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8500 result_buf[i] = source_buf[cur];
8501 }
Tim Petersced69f82003-09-16 20:30:58 +00008502
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008503 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008504 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008505 return result;
8506 }
8507 } else {
8508 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8509 return NULL;
8510 }
8511}
8512
/* Mapping-protocol slots for str: length and subscript (index or slice,
   via unicode_subscript) are provided; subscript assignment is 0. */
8513static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008514 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008515 (binaryfunc)unicode_subscript, /* mp_subscript */
8516 (objobjargproc)0, /* mp_ass_subscript */
8517};
8518
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520/* Helpers for PyUnicode_Format() */
8521
8522static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008525 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 if (argidx < arglen) {
8527 (*p_argidx)++;
8528 if (arglen < 0)
8529 return args;
8530 else
8531 return PyTuple_GetItem(args, argidx);
8532 }
8533 PyErr_SetString(PyExc_TypeError,
8534 "not enough arguments for format string");
8535 return NULL;
8536}
8537
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008539strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 register Py_ssize_t i;
8542 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 for (i = len - 1; i >= 0; i--)
8544 buffer[i] = (Py_UNICODE) charbuffer[i];
8545
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 return len;
8547}
8548
Neal Norwitzfc76d632006-01-10 06:03:13 +00008549static int
8550doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8551{
Tim Peters15231542006-02-16 01:08:01 +00008552 Py_ssize_t result;
8553
Neal Norwitzfc76d632006-01-10 06:03:13 +00008554 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008555 result = strtounicode(buffer, (char *)buffer);
8556 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008557}
8558
/* Disabled (#if 0): long counterpart of doubletounicode().  Formats x
   with PyOS_snprintf() into buffer used as a char array, widens the
   result in place with strtounicode(), and returns the character count
   as int.  Its only visible caller, formatint(), is disabled as well. */
Christian Heimes3fd13992008-03-21 01:05:49 +00008559#if 0
Neal Norwitzfc76d632006-01-10 06:03:13 +00008560static int
8561longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8562{
Tim Peters15231542006-02-16 01:08:01 +00008563 Py_ssize_t result;
8564
Neal Norwitzfc76d632006-01-10 06:03:13 +00008565 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008566 result = strtounicode(buffer, (char *)buffer);
8567 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008568}
Christian Heimes3fd13992008-03-21 01:05:49 +00008569#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008570
Guido van Rossum078151d2002-08-11 04:24:12 +00008571/* XXX To save some code duplication, formatfloat/long/int could have been
8572 shared with stringobject.c, converting from 8-bit to Unicode after the
8573 formatting is done. */
8574
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575static int
8576formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008577 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 int flags,
8579 int prec,
8580 int type,
8581 PyObject *v)
8582{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008583 /* fmt = '%#.' + `prec` + `type`
8584 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 char fmt[20];
8586 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008587
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 x = PyFloat_AsDouble(v);
8589 if (x == -1.0 && PyErr_Occurred())
8590 return -1;
8591 if (prec < 0)
8592 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008593 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8594 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008595 /* Worst case length calc to ensure no buffer overrun:
8596
8597 'g' formats:
8598 fmt = %#.<prec>g
8599 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8600 for any double rep.)
8601 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8602
8603 'f' formats:
8604 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8605 len = 1 + 50 + 1 + prec = 52 + prec
8606
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008607 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008608 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008609
8610 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008611 if (((type == 'g' || type == 'G') &&
8612 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008613 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008614 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008615 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008616 return -1;
8617 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008618 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8619 (flags&F_ALT) ? "#" : "",
8620 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008621 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622}
8623
Tim Peters38fd5b62000-09-21 05:43:11 +00008624static PyObject*
8625formatlong(PyObject *val, int flags, int prec, int type)
8626{
8627 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008628 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008629 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008630 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008631
Christian Heimes72b710a2008-05-26 13:28:38 +00008632 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008633 if (!str)
8634 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008635 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008636 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008637 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008638}
8639
/* Disabled (#if 0): formats v as a C long for %d/%u/%o/%x/%X into buf.
   It converts v with PyLong_AsLong(), checks that buflen can hold the
   worst-case output for the requested precision, builds a printf format
   (inserting its own sign and 0x/0X/0o-style prefix for the alternate
   form) and delegates to longtounicode().  Returns the character count,
   or -1 with an exception set. */
Christian Heimes3fd13992008-03-21 01:05:49 +00008640#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641static int
8642formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008643 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 int flags,
8645 int prec,
8646 int type,
8647 PyObject *v)
8648{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008649 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008650 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8651 * + 1 + 1
8652 * = 24
8653 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008654 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008655 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 long x;
8657
Christian Heimes217cfd12007-12-02 14:31:20 +00008658 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008660 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008661 if (x < 0 && type == 'u') {
8662 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008663 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008664 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8665 sign = "-";
8666 else
8667 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008669 prec = 1;
8670
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008671 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8672 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008673 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008674 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008675 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008676 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008677 return -1;
8678 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008679
8680 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008681 (type == 'x' || type == 'X' || type == 'o')) {
8682 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008683 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008684 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008685 * - when 0 is being converted, the C standard leaves off
8686 * the '0x' or '0X', which is inconsistent with other
8687 * %#x/%#X conversions and inconsistent with Python's
8688 * hex() function
8689 * - there are platforms that violate the standard and
8690 * convert 0 with the '0x' or '0X'
8691 * (Metrowerks, Compaq Tru64)
8692 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008693 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008694 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008695 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008696 * We can achieve the desired consistency by inserting our
8697 * own '0x' or '0X' prefix, and substituting %x/%X in place
8698 * of %#x/%#X.
8699 *
8700 * Note that this is the same approach as used in
8701 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008702 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008703 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8704 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008705 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008706 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008707 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8708 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008709 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008710 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008711 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008712 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008713 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008714 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715}
Christian Heimes3fd13992008-03-21 01:05:49 +00008716#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717
8718static int
8719formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008720 size_t buflen,
8721 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008723 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008724 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008725 if (PyUnicode_GET_SIZE(v) == 1) {
8726 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8727 buf[1] = '\0';
8728 return 1;
8729 }
8730#ifndef Py_UNICODE_WIDE
8731 if (PyUnicode_GET_SIZE(v) == 2) {
8732 /* Decode a valid surrogate pair */
8733 int c0 = PyUnicode_AS_UNICODE(v)[0];
8734 int c1 = PyUnicode_AS_UNICODE(v)[1];
8735 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8736 0xDC00 <= c1 && c1 <= 0xDFFF) {
8737 buf[0] = c0;
8738 buf[1] = c1;
8739 buf[2] = '\0';
8740 return 2;
8741 }
8742 }
8743#endif
8744 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 else {
8747 /* Integer input truncated to a character */
8748 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008749 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008751 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008752
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008753 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008754 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008755 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008756 return -1;
8757 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008758
8759#ifndef Py_UNICODE_WIDE
8760 if (x > 0xffff) {
8761 x -= 0x10000;
8762 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8763 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8764 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008765 }
8766#endif
8767 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008768 buf[1] = '\0';
8769 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008771
8772 onError:
8773 PyErr_SetString(PyExc_TypeError,
8774 "%c requires int or char");
8775 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776}
8777
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008778/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8779
8780 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8781 chars are formatted. XXX This is a magic number. Each formatting
8782 routine does bounds checking to ensure no overflow, but a better
8783 solution may be to malloc a buffer of appropriate size for each
8784 format. For now, the current solution is sufficient.
8785*/
/* Capacity, in Py_UNICODE units, of the scratch buffer used when
   formatting floats and chars.  The expansion is fully parenthesised so
   the macro behaves as a single operand wherever it is used. */
#define FORMATBUFLEN ((size_t)120)
8787
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788PyObject *PyUnicode_Format(PyObject *format,
8789 PyObject *args)
8790{
8791 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008792 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 int args_owned = 0;
8794 PyUnicodeObject *result = NULL;
8795 PyObject *dict = NULL;
8796 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008797
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 if (format == NULL || args == NULL) {
8799 PyErr_BadInternalCall();
8800 return NULL;
8801 }
8802 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008803 if (uformat == NULL)
8804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 fmt = PyUnicode_AS_UNICODE(uformat);
8806 fmtcnt = PyUnicode_GET_SIZE(uformat);
8807
8808 reslen = rescnt = fmtcnt + 100;
8809 result = _PyUnicode_New(reslen);
8810 if (result == NULL)
8811 goto onError;
8812 res = PyUnicode_AS_UNICODE(result);
8813
8814 if (PyTuple_Check(args)) {
8815 arglen = PyTuple_Size(args);
8816 argidx = 0;
8817 }
8818 else {
8819 arglen = -1;
8820 argidx = -2;
8821 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008822 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008823 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 dict = args;
8825
8826 while (--fmtcnt >= 0) {
8827 if (*fmt != '%') {
8828 if (--rescnt < 0) {
8829 rescnt = fmtcnt + 100;
8830 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008831 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8834 --rescnt;
8835 }
8836 *res++ = *fmt++;
8837 }
8838 else {
8839 /* Got a format specifier */
8840 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 Py_UNICODE c = '\0';
8844 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008845 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 PyObject *v = NULL;
8847 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008848 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008851 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852
8853 fmt++;
8854 if (*fmt == '(') {
8855 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008856 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 PyObject *key;
8858 int pcount = 1;
8859
8860 if (dict == NULL) {
8861 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008862 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 goto onError;
8864 }
8865 ++fmt;
8866 --fmtcnt;
8867 keystart = fmt;
8868 /* Skip over balanced parentheses */
8869 while (pcount > 0 && --fmtcnt >= 0) {
8870 if (*fmt == ')')
8871 --pcount;
8872 else if (*fmt == '(')
8873 ++pcount;
8874 fmt++;
8875 }
8876 keylen = fmt - keystart - 1;
8877 if (fmtcnt < 0 || pcount > 0) {
8878 PyErr_SetString(PyExc_ValueError,
8879 "incomplete format key");
8880 goto onError;
8881 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008882#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008883 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 then looked up since Python uses strings to hold
8885 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008886 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 key = PyUnicode_EncodeUTF8(keystart,
8888 keylen,
8889 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008890#else
8891 key = PyUnicode_FromUnicode(keystart, keylen);
8892#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 if (key == NULL)
8894 goto onError;
8895 if (args_owned) {
8896 Py_DECREF(args);
8897 args_owned = 0;
8898 }
8899 args = PyObject_GetItem(dict, key);
8900 Py_DECREF(key);
8901 if (args == NULL) {
8902 goto onError;
8903 }
8904 args_owned = 1;
8905 arglen = -1;
8906 argidx = -2;
8907 }
8908 while (--fmtcnt >= 0) {
8909 switch (c = *fmt++) {
8910 case '-': flags |= F_LJUST; continue;
8911 case '+': flags |= F_SIGN; continue;
8912 case ' ': flags |= F_BLANK; continue;
8913 case '#': flags |= F_ALT; continue;
8914 case '0': flags |= F_ZERO; continue;
8915 }
8916 break;
8917 }
8918 if (c == '*') {
8919 v = getnextarg(args, arglen, &argidx);
8920 if (v == NULL)
8921 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008922 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 PyErr_SetString(PyExc_TypeError,
8924 "* wants int");
8925 goto onError;
8926 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008927 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008928 if (width == -1 && PyErr_Occurred())
8929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 if (width < 0) {
8931 flags |= F_LJUST;
8932 width = -width;
8933 }
8934 if (--fmtcnt >= 0)
8935 c = *fmt++;
8936 }
8937 else if (c >= '0' && c <= '9') {
8938 width = c - '0';
8939 while (--fmtcnt >= 0) {
8940 c = *fmt++;
8941 if (c < '0' || c > '9')
8942 break;
8943 if ((width*10) / 10 != width) {
8944 PyErr_SetString(PyExc_ValueError,
8945 "width too big");
8946 goto onError;
8947 }
8948 width = width*10 + (c - '0');
8949 }
8950 }
8951 if (c == '.') {
8952 prec = 0;
8953 if (--fmtcnt >= 0)
8954 c = *fmt++;
8955 if (c == '*') {
8956 v = getnextarg(args, arglen, &argidx);
8957 if (v == NULL)
8958 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008959 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 PyErr_SetString(PyExc_TypeError,
8961 "* wants int");
8962 goto onError;
8963 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008964 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008965 if (prec == -1 && PyErr_Occurred())
8966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 if (prec < 0)
8968 prec = 0;
8969 if (--fmtcnt >= 0)
8970 c = *fmt++;
8971 }
8972 else if (c >= '0' && c <= '9') {
8973 prec = c - '0';
8974 while (--fmtcnt >= 0) {
8975 c = Py_CHARMASK(*fmt++);
8976 if (c < '0' || c > '9')
8977 break;
8978 if ((prec*10) / 10 != prec) {
8979 PyErr_SetString(PyExc_ValueError,
8980 "prec too big");
8981 goto onError;
8982 }
8983 prec = prec*10 + (c - '0');
8984 }
8985 }
8986 } /* prec */
8987 if (fmtcnt >= 0) {
8988 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 if (--fmtcnt >= 0)
8990 c = *fmt++;
8991 }
8992 }
8993 if (fmtcnt < 0) {
8994 PyErr_SetString(PyExc_ValueError,
8995 "incomplete format");
8996 goto onError;
8997 }
8998 if (c != '%') {
8999 v = getnextarg(args, arglen, &argidx);
9000 if (v == NULL)
9001 goto onError;
9002 }
9003 sign = 0;
9004 fill = ' ';
9005 switch (c) {
9006
9007 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009008 pbuf = formatbuf;
9009 /* presume that buffer length is at least 1 */
9010 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 len = 1;
9012 break;
9013
9014 case 's':
9015 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009016 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 if (PyUnicode_Check(v) && c == 's') {
9018 temp = v;
9019 Py_INCREF(temp);
9020 }
9021 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009023 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009024 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009026 else
9027 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 if (temp == NULL)
9029 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009030 if (PyUnicode_Check(temp))
9031 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009032 else {
9033 Py_DECREF(temp);
9034 PyErr_SetString(PyExc_TypeError,
9035 "%s argument has non-string str()");
9036 goto onError;
9037 }
9038 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009039 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 len = PyUnicode_GET_SIZE(temp);
9041 if (prec >= 0 && len > prec)
9042 len = prec;
9043 break;
9044
9045 case 'i':
9046 case 'd':
9047 case 'u':
9048 case 'o':
9049 case 'x':
9050 case 'X':
9051 if (c == 'i')
9052 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009053 isnumok = 0;
9054 if (PyNumber_Check(v)) {
9055 PyObject *iobj=NULL;
9056
9057 if (PyLong_Check(v)) {
9058 iobj = v;
9059 Py_INCREF(iobj);
9060 }
9061 else {
9062 iobj = PyNumber_Long(v);
9063 }
9064 if (iobj!=NULL) {
9065 if (PyLong_Check(iobj)) {
9066 isnumok = 1;
9067 temp = formatlong(iobj, flags, prec, c);
9068 Py_DECREF(iobj);
9069 if (!temp)
9070 goto onError;
9071 pbuf = PyUnicode_AS_UNICODE(temp);
9072 len = PyUnicode_GET_SIZE(temp);
9073 sign = 1;
9074 }
9075 else {
9076 Py_DECREF(iobj);
9077 }
9078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009080 if (!isnumok) {
9081 PyErr_Format(PyExc_TypeError,
9082 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009083 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009084 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009085 }
9086 if (flags & F_ZERO)
9087 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 break;
9089
9090 case 'e':
9091 case 'E':
9092 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009093 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 case 'g':
9095 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009096 if (c == 'F')
9097 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009098 pbuf = formatbuf;
9099 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9100 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 if (len < 0)
9102 goto onError;
9103 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009104 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 fill = '0';
9106 break;
9107
9108 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009109 pbuf = formatbuf;
9110 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 if (len < 0)
9112 goto onError;
9113 break;
9114
9115 default:
9116 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009117 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009118 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009119 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009120 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009121 (Py_ssize_t)(fmt - 1 -
9122 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 goto onError;
9124 }
9125 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009126 if (*pbuf == '-' || *pbuf == '+') {
9127 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 len--;
9129 }
9130 else if (flags & F_SIGN)
9131 sign = '+';
9132 else if (flags & F_BLANK)
9133 sign = ' ';
9134 else
9135 sign = 0;
9136 }
9137 if (width < len)
9138 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009139 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 reslen -= rescnt;
9141 rescnt = width + fmtcnt + 100;
9142 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009143 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009144 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009145 PyErr_NoMemory();
9146 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009147 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009148 if (_PyUnicode_Resize(&result, reslen) < 0) {
9149 Py_XDECREF(temp);
9150 goto onError;
9151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 res = PyUnicode_AS_UNICODE(result)
9153 + reslen - rescnt;
9154 }
9155 if (sign) {
9156 if (fill != ' ')
9157 *res++ = sign;
9158 rescnt--;
9159 if (width > len)
9160 width--;
9161 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009162 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009163 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009164 assert(pbuf[1] == c);
9165 if (fill != ' ') {
9166 *res++ = *pbuf++;
9167 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009168 }
Tim Petersfff53252001-04-12 18:38:48 +00009169 rescnt -= 2;
9170 width -= 2;
9171 if (width < 0)
9172 width = 0;
9173 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 if (width > len && !(flags & F_LJUST)) {
9176 do {
9177 --rescnt;
9178 *res++ = fill;
9179 } while (--width > len);
9180 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009181 if (fill == ' ') {
9182 if (sign)
9183 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009184 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009185 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009186 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009187 *res++ = *pbuf++;
9188 *res++ = *pbuf++;
9189 }
9190 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009191 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 res += len;
9193 rescnt -= len;
9194 while (--width >= len) {
9195 --rescnt;
9196 *res++ = ' ';
9197 }
9198 if (dict && (argidx < arglen) && c != '%') {
9199 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009200 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009201 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 goto onError;
9203 }
9204 Py_XDECREF(temp);
9205 } /* '%' */
9206 } /* until end */
9207 if (argidx < arglen && !dict) {
9208 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009209 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 goto onError;
9211 }
9212
Thomas Woutersa96affe2006-03-12 00:29:36 +00009213 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9214 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 if (args_owned) {
9216 Py_DECREF(args);
9217 }
9218 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 return (PyObject *)result;
9220
9221 onError:
9222 Py_XDECREF(result);
9223 Py_DECREF(uformat);
9224 if (args_owned) {
9225 Py_DECREF(args);
9226 }
9227 return NULL;
9228}
9229
Jeremy Hylton938ace62002-07-17 16:30:39 +00009230static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009231unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9232
Tim Peters6d6c1a32001-08-02 04:15:00 +00009233static PyObject *
9234unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9235{
9236 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009237 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009238 char *encoding = NULL;
9239 char *errors = NULL;
9240
Guido van Rossume023fe02001-08-30 03:12:59 +00009241 if (type != &PyUnicode_Type)
9242 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009243 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009244 kwlist, &x, &encoding, &errors))
9245 return NULL;
9246 if (x == NULL)
9247 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009248 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009249 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009250 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009251 return PyUnicode_FromEncodedObject(x, encoding, errors);
9252}
9253
Guido van Rossume023fe02001-08-30 03:12:59 +00009254static PyObject *
9255unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9256{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009257 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009258 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009259
9260 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9261 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9262 if (tmp == NULL)
9263 return NULL;
9264 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009265 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009266 if (pnew == NULL) {
9267 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009268 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009269 }
Christian Heimesb186d002008-03-18 15:15:01 +00009270 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009271 if (pnew->str == NULL) {
9272 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009273 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009274 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009275 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009276 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009277 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9278 pnew->length = n;
9279 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009280 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009281 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009282}
9283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009284PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009285"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009286\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009287Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009288encoding defaults to the current default string encoding.\n\
9289errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009290
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009291static PyObject *unicode_iter(PyObject *seq);
9292
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009294 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009295 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 sizeof(PyUnicodeObject), /* tp_size */
9297 0, /* tp_itemsize */
9298 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009299 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009301 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009303 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009304 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009305 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009307 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 (hashfunc) unicode_hash, /* tp_hash*/
9309 0, /* tp_call*/
9310 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009311 PyObject_GenericGetAttr, /* tp_getattro */
9312 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009313 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009314 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9315 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009316 unicode_doc, /* tp_doc */
9317 0, /* tp_traverse */
9318 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009319 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009320 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009321 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009322 0, /* tp_iternext */
9323 unicode_methods, /* tp_methods */
9324 0, /* tp_members */
9325 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009326 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009327 0, /* tp_dict */
9328 0, /* tp_descr_get */
9329 0, /* tp_descr_set */
9330 0, /* tp_dictoffset */
9331 0, /* tp_init */
9332 0, /* tp_alloc */
9333 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009334 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335};
9336
9337/* Initialize the Unicode implementation */
9338
Thomas Wouters78890102000-07-22 19:25:51 +00009339void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009341 int i;
9342
Thomas Wouters477c8d52006-05-27 19:21:47 +00009343 /* XXX - move this array to unicodectype.c ? */
9344 Py_UNICODE linebreak[] = {
9345 0x000A, /* LINE FEED */
9346 0x000D, /* CARRIAGE RETURN */
9347 0x001C, /* FILE SEPARATOR */
9348 0x001D, /* GROUP SEPARATOR */
9349 0x001E, /* RECORD SEPARATOR */
9350 0x0085, /* NEXT LINE */
9351 0x2028, /* LINE SEPARATOR */
9352 0x2029, /* PARAGRAPH SEPARATOR */
9353 };
9354
Fred Drakee4315f52000-05-09 19:53:39 +00009355 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009356 free_list = NULL;
9357 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009359 if (!unicode_empty)
9360 return;
9361
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009362 for (i = 0; i < 256; i++)
9363 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009364 if (PyType_Ready(&PyUnicode_Type) < 0)
9365 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009366
9367 /* initialize the linebreak bloom filter */
9368 bloom_linebreak = make_bloom_mask(
9369 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9370 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009371
9372 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373}
9374
9375/* Finalize the Unicode implementation */
9376
Christian Heimesa156e092008-02-16 07:38:31 +00009377int
9378PyUnicode_ClearFreeList(void)
9379{
9380 int freelist_size = numfree;
9381 PyUnicodeObject *u;
9382
9383 for (u = free_list; u != NULL;) {
9384 PyUnicodeObject *v = u;
9385 u = *(PyUnicodeObject **)u;
9386 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009387 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009388 Py_XDECREF(v->defenc);
9389 PyObject_Del(v);
9390 numfree--;
9391 }
9392 free_list = NULL;
9393 assert(numfree == 0);
9394 return freelist_size;
9395}
9396
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397void
Thomas Wouters78890102000-07-22 19:25:51 +00009398_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009400 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009402 Py_XDECREF(unicode_empty);
9403 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009404
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009405 for (i = 0; i < 256; i++) {
9406 if (unicode_latin1[i]) {
9407 Py_DECREF(unicode_latin1[i]);
9408 unicode_latin1[i] = NULL;
9409 }
9410 }
Christian Heimesa156e092008-02-16 07:38:31 +00009411 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009413
Walter Dörwald16807132007-05-25 13:52:07 +00009414void
9415PyUnicode_InternInPlace(PyObject **p)
9416{
9417 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9418 PyObject *t;
9419 if (s == NULL || !PyUnicode_Check(s))
9420 Py_FatalError(
9421 "PyUnicode_InternInPlace: unicode strings only please!");
9422 /* If it's a subclass, we don't really know what putting
9423 it in the interned dict might do. */
9424 if (!PyUnicode_CheckExact(s))
9425 return;
9426 if (PyUnicode_CHECK_INTERNED(s))
9427 return;
9428 if (interned == NULL) {
9429 interned = PyDict_New();
9430 if (interned == NULL) {
9431 PyErr_Clear(); /* Don't leave an exception */
9432 return;
9433 }
9434 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009435 /* It might be that the GetItem call fails even
9436 though the key is present in the dictionary,
9437 namely when this happens during a stack overflow. */
9438 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009439 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009440 Py_END_ALLOW_RECURSION
9441
Walter Dörwald16807132007-05-25 13:52:07 +00009442 if (t) {
9443 Py_INCREF(t);
9444 Py_DECREF(*p);
9445 *p = t;
9446 return;
9447 }
9448
Martin v. Löwis5b222132007-06-10 09:51:05 +00009449 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009450 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9451 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009452 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009453 return;
9454 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009455 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009456 /* The two references in interned are not counted by refcnt.
9457 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009458 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009459 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9460}
9461
9462void
9463PyUnicode_InternImmortal(PyObject **p)
9464{
9465 PyUnicode_InternInPlace(p);
9466 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9467 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9468 Py_INCREF(*p);
9469 }
9470}
9471
9472PyObject *
9473PyUnicode_InternFromString(const char *cp)
9474{
9475 PyObject *s = PyUnicode_FromString(cp);
9476 if (s == NULL)
9477 return NULL;
9478 PyUnicode_InternInPlace(&s);
9479 return s;
9480}
9481
9482void _Py_ReleaseInternedUnicodeStrings(void)
9483{
9484 PyObject *keys;
9485 PyUnicodeObject *s;
9486 Py_ssize_t i, n;
9487 Py_ssize_t immortal_size = 0, mortal_size = 0;
9488
9489 if (interned == NULL || !PyDict_Check(interned))
9490 return;
9491 keys = PyDict_Keys(interned);
9492 if (keys == NULL || !PyList_Check(keys)) {
9493 PyErr_Clear();
9494 return;
9495 }
9496
9497 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9498 detector, interned unicode strings are not forcibly deallocated;
9499 rather, we give them their stolen references back, and then clear
9500 and DECREF the interned dict. */
9501
9502 n = PyList_GET_SIZE(keys);
9503 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9504 n);
9505 for (i = 0; i < n; i++) {
9506 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9507 switch (s->state) {
9508 case SSTATE_NOT_INTERNED:
9509 /* XXX Shouldn't happen */
9510 break;
9511 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009512 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009513 immortal_size += s->length;
9514 break;
9515 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009516 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009517 mortal_size += s->length;
9518 break;
9519 default:
9520 Py_FatalError("Inconsistent interned string state.");
9521 }
9522 s->state = SSTATE_NOT_INTERNED;
9523 }
9524 fprintf(stderr, "total size of all interned strings: "
9525 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9526 "mortal/immortal\n", mortal_size, immortal_size);
9527 Py_DECREF(keys);
9528 PyDict_Clear(interned);
9529 Py_DECREF(interned);
9530 interned = NULL;
9531}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009532
9533
9534/********************* Unicode Iterator **************************/
9535
9536typedef struct {
9537 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009538 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009539 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9540} unicodeiterobject;
9541
9542static void
9543unicodeiter_dealloc(unicodeiterobject *it)
9544{
9545 _PyObject_GC_UNTRACK(it);
9546 Py_XDECREF(it->it_seq);
9547 PyObject_GC_Del(it);
9548}
9549
9550static int
9551unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9552{
9553 Py_VISIT(it->it_seq);
9554 return 0;
9555}
9556
9557static PyObject *
9558unicodeiter_next(unicodeiterobject *it)
9559{
9560 PyUnicodeObject *seq;
9561 PyObject *item;
9562
9563 assert(it != NULL);
9564 seq = it->it_seq;
9565 if (seq == NULL)
9566 return NULL;
9567 assert(PyUnicode_Check(seq));
9568
9569 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009570 item = PyUnicode_FromUnicode(
9571 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009572 if (item != NULL)
9573 ++it->it_index;
9574 return item;
9575 }
9576
9577 Py_DECREF(seq);
9578 it->it_seq = NULL;
9579 return NULL;
9580}
9581
9582static PyObject *
9583unicodeiter_len(unicodeiterobject *it)
9584{
9585 Py_ssize_t len = 0;
9586 if (it->it_seq)
9587 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009588 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009589}
9590
9591PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9592
9593static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009594 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9595 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009596 {NULL, NULL} /* sentinel */
9597};
9598
9599PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009600 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009601 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009602 sizeof(unicodeiterobject), /* tp_basicsize */
9603 0, /* tp_itemsize */
9604 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009605 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009606 0, /* tp_print */
9607 0, /* tp_getattr */
9608 0, /* tp_setattr */
9609 0, /* tp_compare */
9610 0, /* tp_repr */
9611 0, /* tp_as_number */
9612 0, /* tp_as_sequence */
9613 0, /* tp_as_mapping */
9614 0, /* tp_hash */
9615 0, /* tp_call */
9616 0, /* tp_str */
9617 PyObject_GenericGetAttr, /* tp_getattro */
9618 0, /* tp_setattro */
9619 0, /* tp_as_buffer */
9620 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9621 0, /* tp_doc */
9622 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9623 0, /* tp_clear */
9624 0, /* tp_richcompare */
9625 0, /* tp_weaklistoffset */
9626 PyObject_SelfIter, /* tp_iter */
9627 (iternextfunc)unicodeiter_next, /* tp_iternext */
9628 unicodeiter_methods, /* tp_methods */
9629 0,
9630};
9631
9632static PyObject *
9633unicode_iter(PyObject *seq)
9634{
9635 unicodeiterobject *it;
9636
9637 if (!PyUnicode_Check(seq)) {
9638 PyErr_BadInternalCall();
9639 return NULL;
9640 }
9641 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9642 if (it == NULL)
9643 return NULL;
9644 it->it_index = 0;
9645 Py_INCREF(seq);
9646 it->it_seq = (PyUnicodeObject *)seq;
9647 _PyObject_GC_TRACK(it);
9648 return (PyObject *)it;
9649}
9650
Martin v. Löwis5b222132007-06-10 09:51:05 +00009651size_t
9652Py_UNICODE_strlen(const Py_UNICODE *u)
9653{
9654 int res = 0;
9655 while(*u++)
9656 res++;
9657 return res;
9658}
9659
9660Py_UNICODE*
9661Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9662{
9663 Py_UNICODE *u = s1;
9664 while ((*u++ = *s2++));
9665 return s1;
9666}
9667
9668Py_UNICODE*
9669Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9670{
9671 Py_UNICODE *u = s1;
9672 while ((*u++ = *s2++))
9673 if (n-- == 0)
9674 break;
9675 return s1;
9676}
9677
9678int
9679Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9680{
9681 while (*s1 && *s2 && *s1 == *s2)
9682 s1++, s2++;
9683 if (*s1 && *s2)
9684 return (*s1 < *s2) ? -1 : +1;
9685 if (*s1)
9686 return 1;
9687 if (*s2)
9688 return -1;
9689 return 0;
9690}
9691
9692Py_UNICODE*
9693Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9694{
9695 const Py_UNICODE *p;
9696 for (p = s; *p; p++)
9697 if (*p == c)
9698 return (Py_UNICODE*)p;
9699 return NULL;
9700}
9701
9702
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009703#ifdef __cplusplus
9704}
9705#endif
9706
9707
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009708/*
9709Local variables:
9710c-basic-offset: 4
9711indent-tabs-mode: nil
9712End:
9713*/