blob: e57b60cab96d08275b8a5f557b9cf4172f9b328a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length less than this
   limit.  This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: HORIZONTAL TABULATION
     *   0x000A: LINE FEED
     *   0x000B: VERTICAL TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
/* Lookup table indexed by ASCII code: 1 for line break characters,
   0 otherwise.  The table is only read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 is
   well defined (1 << 31 overflows a 32-bit signed int), and mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break.  Characters below 128 are looked up
   in ascii_linebreak; others are filtered through bloom_linebreak
   before the exact Py_UNICODE_ISLINEBREAK() test.  Note that ch is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Non-zero if chr is a member of set: the bloom mask rejects most
   non-members cheaply before the exact unicode_member() scan.
   The expansion is fully parenthesized so that it can be negated or
   combined with other operators.  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE than requested to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
428 PyErr_BadInternalCall();
429 return -1;
430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 PyErr_BadInternalCall();
434 return -1;
435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000447 Py_DECREF(*unicode);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000448 *unicode = w;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 return 0;
450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
475 }
476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 if (!unicode)
484 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000485 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
510 "Negative size passed to PyUnicode_FromStringAndSize");
511 return NULL;
512 }
513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
524 }
525
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000529 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000534 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566{
567 PyUnicodeObject *unicode;
568
569 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000570 if (size == 0)
571 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572 PyErr_BadInternalCall();
573 return NULL;
574 }
575
Martin v. Löwis790465f2008-04-05 20:41:37 +0000576 if (size == -1) {
577 size = wcslen(w);
578 }
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 unicode = _PyUnicode_New(size);
581 if (!unicode)
582 return NULL;
583
584 /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000587#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 {
589 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000590 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000592 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 *u++ = *w++;
594 }
595#endif
596
597 return (PyObject *)unicode;
598}
599
Walter Dörwald346737f2007-05-31 10:44:43 +0000600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603 *fmt++ = '%';
604 if (width) {
605 if (zeropad)
606 *fmt++ = '0';
607 fmt += sprintf(fmt, "%d", width);
608 }
609 if (precision)
610 fmt += sprintf(fmt, ".%d", precision);
611 if (longflag)
612 *fmt++ = 'l';
613 else if (size_tflag) {
614 char *f = PY_FORMAT_SIZE_T;
615 while (*f)
616 *fmt++ = *f++;
617 }
618 *fmt++ = c;
619 *fmt = '\0';
620}
621
/* Append the NUL-terminated char string `string` to the output at s,
   one element per char, advancing s.  Relies on the variables `copy`
   and `s` being declared in the calling scope. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000628 Py_ssize_t callcount = 0;
629 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000630 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000631 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000632 int width = 0;
633 int precision = 0;
634 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000635 const char* f;
636 Py_UNICODE *s;
637 PyObject *string;
638 /* used by sprintf */
639 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000640 /* use abuffer instead of buffer, if we need more space
641 * (which can happen if there's a format specifier with width). */
642 char *abuffer = NULL;
643 char *realbuffer;
644 Py_ssize_t abuffersize = 0;
645 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646 const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649 Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef __va_copy
652 __va_copy(count, vargs);
653#else
654 count = vargs;
655#endif
656#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000657 /* step 1: count the number of %S/%R/%A format specifications
658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659 * these objects once during step 3 and put the result in
660 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000663 ++callcount;
664 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000665 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000666 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000669 if (!callresults) {
670 PyErr_NoMemory();
671 return NULL;
672 }
673 callresult = callresults;
674 }
675 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 for (f = format; *f; f++) {
677 if (*f == '%') {
678 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000681 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683 ;
684
685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686 * they don't affect the amount of space we reserve.
687 */
688 if ((*f == 'l' || *f == 'z') &&
689 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000690 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691
692 switch (*f) {
693 case 'c':
694 (void)va_arg(count, int);
695 /* fall through... */
696 case '%':
697 n++;
698 break;
699 case 'd': case 'u': case 'i': case 'x':
700 (void) va_arg(count, int);
701 /* 20 bytes is enough to hold a 64-bit
702 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000703 This isn't enough for octal.
704 If a width is specified we need more
705 (which we allocate later). */
706 if (width < 20)
707 width = 20;
708 n += width;
709 if (abuffersize < width)
710 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711 break;
712 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000713 {
714 /* UTF-8 */
715 unsigned char*s;
716 s = va_arg(count, unsigned char*);
717 while (*s) {
718 if (*s < 128) {
719 n++; s++;
720 } else if (*s < 0xc0) {
721 /* invalid UTF-8 */
722 n++; s++;
723 } else if (*s < 0xc0) {
724 n++;
725 s++; if(!*s)break;
726 s++;
727 } else if (*s < 0xe0) {
728 n++;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 } else {
733 #ifdef Py_UNICODE_WIDE
734 n++;
735 #else
736 n+=2;
737 #endif
738 s++; if(!*s)break;
739 s++; if(!*s)break;
740 s++; if(!*s)break;
741 s++;
742 }
743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000745 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 case 'U':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 assert(obj && PyUnicode_Check(obj));
750 n += PyUnicode_GET_SIZE(obj);
751 break;
752 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000753 case 'V':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 const char *str = va_arg(count, const char *);
757 assert(obj || str);
758 assert(!obj || PyUnicode_Check(obj));
759 if (obj)
760 n += PyUnicode_GET_SIZE(obj);
761 else
762 n += strlen(str);
763 break;
764 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000765 case 'S':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 PyObject *str;
769 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000770 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000771 if (!str)
772 goto fail;
773 n += PyUnicode_GET_SIZE(str);
774 /* Remember the str and switch to the next slot */
775 *callresult++ = str;
776 break;
777 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000778 case 'R':
779 {
780 PyObject *obj = va_arg(count, PyObject *);
781 PyObject *repr;
782 assert(obj);
783 repr = PyObject_Repr(obj);
784 if (!repr)
785 goto fail;
786 n += PyUnicode_GET_SIZE(repr);
787 /* Remember the repr and switch to the next slot */
788 *callresult++ = repr;
789 break;
790 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000791 case 'A':
792 {
793 PyObject *obj = va_arg(count, PyObject *);
794 PyObject *ascii;
795 assert(obj);
796 ascii = PyObject_ASCII(obj);
797 if (!ascii)
798 goto fail;
799 n += PyUnicode_GET_SIZE(ascii);
800 /* Remember the repr and switch to the next slot */
801 *callresult++ = ascii;
802 break;
803 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 case 'p':
805 (void) va_arg(count, int);
806 /* maximum 64-bit pointer representation:
807 * 0xffffffffffffffff
808 * so 19 characters is enough.
809 * XXX I count 18 -- what's the extra for?
810 */
811 n += 19;
812 break;
813 default:
814 /* if we stumble upon an unknown
815 formatting code, copy the rest of
816 the format string to the output
817 string. (we cannot just skip the
818 code, since there's no way to know
819 what's in the argument list) */
820 n += strlen(p);
821 goto expand;
822 }
823 } else
824 n++;
825 }
826 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000828 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000829 if (!abuffer) {
830 PyErr_NoMemory();
831 goto fail;
832 }
833 realbuffer = abuffer;
834 }
835 else
836 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 we don't have to resize the string.
840 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 string = PyUnicode_FromUnicode(NULL, n);
842 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844
845 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000846 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847
848 for (f = format; *f; f++) {
849 if (*f == '%') {
850 const char* p = f++;
851 int longflag = 0;
852 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 zeropad = (*f == '0');
854 /* parse the width.precision part */
855 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000856 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 width = (width*10) + *f++ - '0';
858 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (*f == '.') {
860 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000861 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 /* handle the long flag, but only for %ld and %lu.
865 others can be added when necessary. */
866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867 longflag = 1;
868 ++f;
869 }
870 /* handle the size_t flag. */
871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872 size_tflag = 1;
873 ++f;
874 }
875
876 switch (*f) {
877 case 'c':
878 *s++ = va_arg(vargs, int);
879 break;
880 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000887 sprintf(realbuffer, fmt, va_arg(vargs, int));
888 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 break;
890 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
905 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 break;
910 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000911 {
912 /* Parameter must be UTF-8 encoded.
913 In case of encoding errors, use
914 the replacement character. */
915 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000917 u = PyUnicode_DecodeUTF8(p, strlen(p),
918 "replace");
919 if (!u)
920 goto fail;
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922 PyUnicode_GET_SIZE(u));
923 s += PyUnicode_GET_SIZE(u);
924 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000926 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000927 case 'U':
928 {
929 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000933 break;
934 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000935 case 'V':
936 {
937 PyObject *obj = va_arg(vargs, PyObject *);
938 const char *str = va_arg(vargs, const char *);
939 if (obj) {
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 } else {
944 appendstring(str);
945 }
946 break;
947 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000948 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000949 case 'R':
950 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000951 Py_UNICODE *ucopy;
952 Py_ssize_t usize;
953 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 /* unused, since we already have the result */
955 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000956 ucopy = PyUnicode_AS_UNICODE(*callresult);
957 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 for (upos = 0; upos<usize;)
959 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000962 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000963 ++callresult;
964 break;
965 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966 case 'p':
967 sprintf(buffer, "%p", va_arg(vargs, void*));
968 /* %p is ill-defined: ensure leading 0x. */
969 if (buffer[1] == 'X')
970 buffer[1] = 'x';
971 else if (buffer[1] != 'x') {
972 memmove(buffer+2, buffer, strlen(buffer)+1);
973 buffer[0] = '0';
974 buffer[1] = 'x';
975 }
976 appendstring(buffer);
977 break;
978 case '%':
979 *s++ = '%';
980 break;
981 default:
982 appendstring(p);
983 goto end;
984 }
985 } else
986 *s++ = *f;
987 }
988
989 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000990 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000992 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000993 PyObject_Free(abuffer);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000996 fail:
997 if (callresults) {
998 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000999 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 Py_DECREF(*callresult2);
1001 ++callresult2;
1002 }
Christian Heimesb186d002008-03-18 15:15:01 +00001003 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001004 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001005 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001006 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001007 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015 PyObject* ret;
1016 va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019 va_start(vargs, format);
1020#else
1021 va_start(vargs);
1022#endif
1023 ret = PyUnicode_FromFormatV(format, vargs);
1024 va_end(vargs);
1025 return ret;
1026}
1027
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029 wchar_t *w,
1030 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031{
1032 if (unicode == NULL) {
1033 PyErr_BadInternalCall();
1034 return -1;
1035 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001036
1037 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039 size = PyUnicode_GET_SIZE(unicode) + 1;
1040
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041#ifdef HAVE_USABLE_WCHAR_T
1042 memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044 {
1045 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001046 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001048 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 *w++ = *u++;
1050 }
1051#endif
1052
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001053 if (size > PyUnicode_GET_SIZE(unicode))
1054 return PyUnicode_GET_SIZE(unicode);
1055 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 return size;
1057}
1058
1059#endif
1060
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065 if (ordinal < 0 || ordinal > 0x10ffff) {
1066 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001067 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 return NULL;
1069 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001070
1071#ifndef Py_UNICODE_WIDE
1072 if (ordinal > 0xffff) {
1073 ordinal -= 0x10000;
1074 s[0] = 0xD800 | (ordinal >> 10);
1075 s[1] = 0xDC00 | (ordinal & 0x3FF);
1076 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#endif
1079
Hye-Shik Chang40574832004-04-06 07:24:51 +00001080 s[0] = (Py_UNICODE)ordinal;
1081 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001082}
1083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 if (PyUnicode_CheckExact(obj)) {
1089 Py_INCREF(obj);
1090 return obj;
1091 }
1092 if (PyUnicode_Check(obj)) {
1093 /* For a Unicode subtype that's not a Unicode object,
1094 return a true Unicode object with the same data. */
1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096 PyUnicode_GET_SIZE(obj));
1097 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001098 PyErr_Format(PyExc_TypeError,
1099 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001100 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001101 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105 const char *encoding,
1106 const char *errors)
1107{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (obj == NULL) {
1113 PyErr_BadInternalCall();
1114 return NULL;
1115 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001117 if (PyUnicode_Check(obj)) {
1118 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001119 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122
1123 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001124 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001125 s = PyBytes_AS_STRING(obj);
1126 len = PyBytes_GET_SIZE(obj);
1127 }
1128 else if (PyByteArray_Check(obj)) {
1129 s = PyByteArray_AS_STRING(obj);
1130 len = PyByteArray_GET_SIZE(obj);
1131 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001132 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133 /* Overwrite the error message with something more useful in
1134 case of a TypeError. */
1135 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001137 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001138 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001139 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001140 goto onError;
1141 }
Tim Petersced69f82003-09-16 20:30:58 +00001142
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001143 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 if (len == 0) {
1145 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001146 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 }
Tim Petersced69f82003-09-16 20:30:58 +00001148 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001150
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001151 return v;
1152
1153 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 const char *encoding,
1160 const char *errors)
1161{
1162 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001163 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 char lower[20]; /* Enough for any encoding name we recognize */
1165 char *l;
1166 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
1168 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 encoding = PyUnicode_GetDefaultEncoding();
1170
1171 /* Convert encoding to lower case and replace '_' with '-' in order to
1172 catch e.g. UTF_8 */
1173 e = encoding;
1174 l = lower;
1175 while (*e && l < &lower[(sizeof lower) - 2]) {
1176 if (ISUPPER(*e)) {
1177 *l++ = TOLOWER(*e++);
1178 }
1179 else if (*e == '_') {
1180 *l++ = '-';
1181 e++;
1182 }
1183 else {
1184 *l++ = *e++;
1185 }
1186 }
1187 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001188
1189 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if ((strcmp(lower, "latin-1") == 0) ||
1193 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001196 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001201 else if (strcmp(lower, "utf-16") == 0)
1202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203 else if (strcmp(lower, "utf-32") == 0)
1204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (buffer == NULL)
1212 goto onError;
1213 unicode = PyCodec_Decode(buffer, encoding, errors);
1214 if (unicode == NULL)
1215 goto onError;
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_DECREF(unicode);
1221 goto onError;
1222 }
1223 Py_DECREF(buffer);
1224 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 onError:
1227 Py_XDECREF(buffer);
1228 return NULL;
1229}
1230
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v;
1236
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Decode via the codec registry */
1246 v = PyCodec_Decode(unicode, encoding, errors);
1247 if (v == NULL)
1248 goto onError;
1249 return v;
1250
1251 onError:
1252 return NULL;
1253}
1254
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256 const char *encoding,
1257 const char *errors)
1258{
1259 PyObject *v;
1260
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 goto onError;
1264 }
1265
1266 if (encoding == NULL)
1267 encoding = PyUnicode_GetDefaultEncoding();
1268
1269 /* Decode via the codec registry */
1270 v = PyCodec_Decode(unicode, encoding, errors);
1271 if (v == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(v)) {
1274 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001275 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001276 Py_TYPE(v)->tp_name);
1277 Py_DECREF(v);
1278 goto onError;
1279 }
1280 return v;
1281
1282 onError:
1283 return NULL;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 unicode = PyUnicode_FromUnicode(s, size);
1294 if (unicode == NULL)
1295 return NULL;
1296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297 Py_DECREF(unicode);
1298 return v;
1299}
1300
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302 const char *encoding,
1303 const char *errors)
1304{
1305 PyObject *v;
1306
1307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadArgument();
1309 goto onError;
1310 }
1311
1312 if (encoding == NULL)
1313 encoding = PyUnicode_GetDefaultEncoding();
1314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 return v;
1320
1321 onError:
1322 return NULL;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326 const char *encoding,
1327 const char *errors)
1328{
1329 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (!PyUnicode_Check(unicode)) {
1332 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 }
Fred Drakee4315f52000-05-09 19:53:39 +00001335
Tim Petersced69f82003-09-16 20:30:58 +00001336 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001337 encoding = PyUnicode_GetDefaultEncoding();
1338
1339 /* Shortcuts for common default encodings */
1340 if (errors == NULL) {
1341 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001342 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001343 else if (strcmp(encoding, "latin-1") == 0)
1344 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346 else if (strcmp(encoding, "mbcs") == 0)
1347 return PyUnicode_AsMBCSString(unicode);
1348#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001349 else if (strcmp(encoding, "ascii") == 0)
1350 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001351 /* During bootstrap, we may need to find the encodings
1352 package, to load the file system encoding, and require the
1353 file system encoding in order to load the encodings
1354 package.
1355
1356 Break out of this dependency by assuming that the path to
1357 the encodings module is ASCII-only. XXX could try wcstombs
1358 instead, if the file system encoding is the locale's
1359 encoding. */
1360 else if (Py_FileSystemDefaultEncoding &&
1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362 !PyThreadState_GET()->interp->codecs_initialized)
1363 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
1366 /* Encode via the codec registry */
1367 v = PyCodec_Encode(unicode, encoding, errors);
1368 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001369 return NULL;
1370
1371 /* The normal path */
1372 if (PyBytes_Check(v))
1373 return v;
1374
1375 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376 if (PyByteArray_Check(v)) {
1377 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001378 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001379 PyOS_snprintf(msg, sizeof(msg),
1380 "encoder %s returned buffer instead of bytes",
1381 encoding);
1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001383 Py_DECREF(v);
1384 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001385 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388 Py_DECREF(v);
1389 return b;
1390 }
1391
1392 PyErr_Format(PyExc_TypeError,
1393 "encoder did not return a bytes object (type=%.400s)",
1394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
1411 encoding = PyUnicode_GetDefaultEncoding();
1412
1413 /* Encode via the codec registry */
1414 v = PyCodec_Encode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 onError:
1427 return NULL;
1428}
1429
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431 const char *errors)
1432{
1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001434 if (v)
1435 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001436 if (errors != NULL)
1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001439 PyUnicode_GET_SIZE(unicode),
1440 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001441 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001442 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001443 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001444 return v;
1445}
1446
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001448PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001450 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001452
Christian Heimes5894ba72007-11-04 11:43:14 +00001453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457 can be undefined. If it is case, decode using UTF-8. The following assumes
1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459 bootstrapping process where the codecs aren't ready yet.
1460 */
1461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001464 return PyUnicode_DecodeMBCS(s, size, "replace");
1465 }
1466#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001468 return PyUnicode_DecodeUTF8(s, size, "replace");
1469 }
1470#endif
1471 return PyUnicode_Decode(s, size,
1472 Py_FileSystemDefaultEncoding,
1473 "replace");
1474 }
1475 else {
1476 return PyUnicode_DecodeUTF8(s, size, "replace");
1477 }
1478}
1479
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001482{
Christian Heimesf3863112007-11-22 07:46:41 +00001483 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
1486 return NULL;
1487 }
Christian Heimesf3863112007-11-22 07:46:41 +00001488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001490 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001491 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001492 *psize = PyBytes_GET_SIZE(bytes);
1493 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001494}
1495
1496char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001498{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001499 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001500}
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 goto onError;
1507 }
1508 return PyUnicode_AS_UNICODE(unicode);
1509
1510 onError:
1511 return NULL;
1512}
1513
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515{
1516 if (!PyUnicode_Check(unicode)) {
1517 PyErr_BadArgument();
1518 goto onError;
1519 }
1520 return PyUnicode_GET_SIZE(unicode);
1521
1522 onError:
1523 return -1;
1524}
1525
Thomas Wouters78890102000-07-22 19:25:51 +00001526const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001527{
1528 return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001533 if (strcmp(encoding, unicode_default_encoding) != 0) {
1534 PyErr_Format(PyExc_ValueError,
1535 "Can only set default encoding to %s",
1536 unicode_default_encoding);
1537 return -1;
1538 }
Fred Drakee4315f52000-05-09 19:53:39 +00001539 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001540}
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542/* error handling callback helper:
1543 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001544 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 and adjust various state variables.
1546 return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001552 const char **input, const char **inend, Py_ssize_t *startinpos,
1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557
1558 PyObject *restuple = NULL;
1559 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t requiredsize;
1563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001565 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 int res = -1;
1568
1569 if (*errorHandler == NULL) {
1570 *errorHandler = PyCodec_LookupError(errors);
1571 if (*errorHandler == NULL)
1572 goto onError;
1573 }
1574
1575 if (*exceptionObject == NULL) {
1576 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 if (*exceptionObject == NULL)
1579 goto onError;
1580 }
1581 else {
1582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585 goto onError;
1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587 goto onError;
1588 }
1589
1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591 if (restuple == NULL)
1592 goto onError;
1593 if (!PyTuple_Check(restuple)) {
1594 PyErr_Format(PyExc_TypeError, &argparse[4]);
1595 goto onError;
1596 }
1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
1600 /* Copy back the bytes variables, which might have been modified by the
1601 callback */
1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603 if (!inputobj)
1604 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001605 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001608 *input = PyBytes_AS_STRING(inputobj);
1609 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001610 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001611 /* we can DECREF safely, as the exception has another reference,
1612 so the object won't go away. */
1613 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001616 newpos = insize+newpos;
1617 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001619 goto onError;
1620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 /* need more space? (at least enough for what we
1623 have+the replacement+the rest of the string (starting
1624 at the new input position), so we won't have to check space
1625 when there are no errors in the rest of the string) */
1626 repptr = PyUnicode_AS_UNICODE(repunicode);
1627 repsize = PyUnicode_GET_SIZE(repunicode);
1628 requiredsize = *outpos + repsize + insize-newpos;
1629 if (requiredsize > outsize) {
1630 if (requiredsize<2*outsize)
1631 requiredsize = 2*outsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632 if (_PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 goto onError;
1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635 }
1636 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001637 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_UNICODE_COPY(*outptr, repptr, repsize);
1639 *outptr += repsize;
1640 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 /* we made it! */
1643 res = 0;
1644
1645 onError:
1646 Py_XDECREF(restuple);
1647 return res;
1648}
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded: it is outside 1..127, is
   always special, or belongs to an optional class (whitespace / Set O)
   whose encoding was requested through encodeWS / encodeO. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a character of the base64 alphabet. */
#define B64CHAR(c)                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Inverse of B64: map a base64 alphabet character to its 6-bit value.
   The result is only meaningful when B64CHAR(c) is true. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters through `out` for as long as at least 6 of the
   `bits` pending bits held in `ch` are available; `out` and `bits` are
   updated in place.  Wrapped in do/while(0) so the macro behaves as a
   single statement. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* Emit 16-bit code units through `out` for as long as at least 16 of the
   `bits` pending bits held in `ch` are available.  Relies on the expanding
   function providing an `errmsg` variable and a `utf7Error` label.
   NOTE(review): the range tested below (0xDC00..0xDFFF) is the low
   surrogate range although the comments speak of the high surrogate --
   left unchanged, confirm the intent. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 const char *errors)
1719{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
/* Decode the UTF-7 bytes s[0..size) into a new unicode object.

   errors   - error handler name, passed on to
              unicode_decode_call_errorhandler() whenever malformed input
              is found.
   consumed - if NULL, a shift sequence still open at the end of the input
              is reported as an "unterminated shift sequence" error.  If
              non-NULL, no such error is raised: *consumed is set to
              startinpos (the offset recorded when a '+' was seen) while a
              sequence is open, otherwise to the number of bytes processed.

   Returns the decoded object, or NULL on failure. */
1723PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1724 Py_ssize_t size,
1725 const char *errors,
1726 Py_ssize_t *consumed)
1727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001729 Py_ssize_t startinpos;
1730 Py_ssize_t endinpos;
1731 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 const char *e;
1733 PyUnicodeObject *unicode;
1734 Py_UNICODE *p;
1735 const char *errmsg = "";
/* inShift: currently inside a '+'...'-' base64 run;
   charsleft/bitsleft: accumulator of decoded bits and how many are pending;
   surrogate: state flag maintained by the DECODE macro */
1736 int inShift = 0;
1737 unsigned int bitsleft = 0;
1738 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 int surrogate = 0;
1740 PyObject *errorHandler = NULL;
1741 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
/* the output starts out with one code unit per input byte; it is trimmed
   to the real length before returning */
1743 unicode = _PyUnicode_New(size);
1744 if (!unicode)
1745 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001746 if (size == 0) {
1747 if (consumed)
1748 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751
1752 p = unicode->str;
1753 e = s + size;
1754
1755 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 Py_UNICODE ch;
/* also entered by "goto restart" from the end-of-input handling below */
1757 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001758 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759
1760 if (inShift) {
/* '-' or any non-base64 character ends the shift sequence */
1761 if ((ch == '-') || !B64CHAR(ch)) {
1762 inShift = 0;
1763 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1766 if (bitsleft >= 6) {
1767 /* The shift sequence has a partial character in it. If
1768 bitsleft < 6 then we could just classify it as padding
1769 but that is not the case here */
1770
1771 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001772 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 }
1774 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001775 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 here so indicate the potential of a misencoded character. */
1777
1778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1780 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001781 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 }
1783
1784 if (ch == '-') {
/* a second '-' right after the terminator: a literal '-' is emitted
   and shift mode is switched back on; s is not advanced here */
1785 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001786 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 inShift = 1;
1788 }
1789 } else if (SPECIAL(ch,0,0)) {
1790 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001791 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 } else {
/* the terminating character is itself ordinary data */
1793 *p++ = ch;
1794 }
1795 } else {
/* base64 character: shift its 6 bits into the accumulator and emit
   any complete 16-bit units */
1796 charsleft = (charsleft << 6) | UB64(ch);
1797 bitsleft += 6;
1798 s++;
1799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1800 }
1801 }
1802 else if ( ch == '+' ) {
/* remember where the sequence starts, for error reporting and for
   *consumed */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 s++;
/* "+-" encodes a literal '+' */
1805 if (s < e && *s == '-') {
1806 s++;
1807 *p++ = '+';
1808 } else
1809 {
1810 inShift = 1;
1811 bitsleft = 0;
1812 }
1813 }
1814 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001815 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 errmsg = "unexpected special character";
1817 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001818 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 }
1820 else {
/* directly encoded character: copy through */
1821 *p++ = ch;
1822 s++;
1823 }
1824 continue;
/* shared error path: the handler receives the addresses of starts, e, s,
   unicode and p and may update them */
1825 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 outpos = p-PyUnicode_AS_UNICODE(unicode);
1827 endinpos = s-starts;
1828 if (unicode_decode_call_errorhandler(
1829 errors, &errorHandler,
1830 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001831 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001832 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 }
1835
/* input exhausted inside a shift sequence: an error unless the caller
   asked for stateful decoding (consumed != NULL) */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001836 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 outpos = p-PyUnicode_AS_UNICODE(unicode);
1838 endinpos = size;
1839 if (unicode_decode_call_errorhandler(
1840 errors, &errorHandler,
1841 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001843 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 goto onError;
/* the handler may have moved s back before the end of the input;
   in that case resume decoding inside the loop */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 if (s < e)
1846 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001848 if (consumed) {
1849 if(inShift)
1850 *consumed = startinpos;
1851 else
1852 *consumed = s-starts;
1853 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854
/* trim the output to the number of code units actually written */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 goto onError;
1857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 return (PyObject *)unicode;
1861
/* failure: release the handler, the exception object and the partial
   result */
1862onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865 Py_DECREF(unicode);
1866 return NULL;
1867}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 int encodeSetO,
1873 int encodeWhiteSpace,
1874 const char *errors)
1875{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001876 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 unsigned int bitsleft = 0;
1882 unsigned long charsleft = 0;
1883 char * out;
1884 char * start;
1885
1886 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001887 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001889 if (cbAllocated / 5 != size)
1890 return PyErr_NoMemory();
1891
Christian Heimes9c4756e2008-05-26 13:22:05 +00001892 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (v == NULL)
1894 return NULL;
1895
Christian Heimes9c4756e2008-05-26 13:22:05 +00001896 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 for (;i < size; ++i) {
1898 Py_UNICODE ch = s[i];
1899
1900 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001901 if (ch == '+') {
1902 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *out++ = '-';
1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905 charsleft = ch;
1906 bitsleft = 16;
1907 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 } else {
1911 *out++ = (char) ch;
1912 }
1913 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915 *out++ = B64(charsleft << (6-bitsleft));
1916 charsleft = 0;
1917 bitsleft = 0;
1918 /* Characters not in the BASE64 set implicitly unshift the sequence
1919 so no '-' is required, except if the character is itself a '-' */
1920 if (B64CHAR(ch) || ch == '-') {
1921 *out++ = '-';
1922 }
1923 inShift = 0;
1924 *out++ = (char) ch;
1925 } else {
1926 bitsleft += 16;
1927 charsleft = (charsleft << 16) | ch;
1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001931 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 or '-' then the shift sequence will be terminated implicitly and we
1933 don't have to insert a '-'. */
1934
1935 if (bitsleft == 0) {
1936 if (i + 1 < size) {
1937 Py_UNICODE ch2 = s[i+1];
1938
1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 } else if (B64CHAR(ch2) || ch2 == '-') {
1942 *out++ = '-';
1943 inShift = 0;
1944 } else {
1945 inShift = 0;
1946 }
1947
1948 }
1949 else {
1950 *out++ = '-';
1951 inShift = 0;
1952 }
1953 }
Tim Petersced69f82003-09-16 20:30:58 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (bitsleft) {
1958 *out++= B64(charsleft << (6-bitsleft) );
1959 *out++ = '-';
1960 }
1961
Christian Heimes72b710a2008-05-26 13:28:38 +00001962 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001963 Py_DECREF(v);
1964 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965}
1966
/* Remove the helper macros defined for the UTF-7 codec above. */
1967#undef SPECIAL
1968#undef B64
1969#undef B64CHAR
1970#undef UB64
1971#undef ENCODE
1972#undef DECODE
1973
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974/* --- UTF-8 Codec -------------------------------------------------------- */
1975
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       The table is only ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1997
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 const char *errors)
2001{
Walter Dörwald69652032004-09-07 20:24:22 +00002002 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2003}
2004
2005PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002007 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002008 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002010 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t startinpos;
2013 Py_ssize_t endinpos;
2014 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 const char *e;
2016 PyUnicodeObject *unicode;
2017 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002018 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Note: size will always be longer than the resulting Unicode
2023 character count */
2024 unicode = _PyUnicode_New(size);
2025 if (!unicode)
2026 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002027 if (size == 0) {
2028 if (consumed)
2029 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032
2033 /* Unpack UTF-8 encoded data */
2034 p = unicode->str;
2035 e = s + size;
2036
2037 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039
2040 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002041 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 s++;
2043 continue;
2044 }
2045
2046 n = utf8_code_length[ch];
2047
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002048 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
2050 break;
2051 else {
2052 errmsg = "unexpected end of data";
2053 startinpos = s-starts;
2054 endinpos = size;
2055 goto utf8Error;
2056 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
2059 switch (n) {
2060
2061 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002062 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 startinpos = s-starts;
2064 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002065 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
2067 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002068 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 startinpos = s-starts;
2070 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002071 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
2073 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002074 if ((s[1] & 0xc0) != 0x80) {
2075 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 startinpos = s-starts;
2077 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002078 goto utf8Error;
2079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002081 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 startinpos = s-starts;
2083 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002084 errmsg = "illegal encoding";
2085 goto utf8Error;
2086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002088 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 break;
2090
2091 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002092 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002093 (s[2] & 0xc0) != 0x80) {
2094 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 startinpos = s-starts;
2096 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002097 goto utf8Error;
2098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002100 if (ch < 0x0800) {
2101 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002102 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002103
2104 XXX For wide builds (UCS-4) we should probably try
2105 to recombine the surrogates into a single code
2106 unit.
2107 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002108 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002109 startinpos = s-starts;
2110 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002111 goto utf8Error;
2112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002114 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 break;
2116
2117 case 4:
2118 if ((s[1] & 0xc0) != 0x80 ||
2119 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002120 (s[3] & 0xc0) != 0x80) {
2121 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 startinpos = s-starts;
2123 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002124 goto utf8Error;
2125 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2127 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2128 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002130 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002132 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002133 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 startinpos = s-starts;
2136 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 goto utf8Error;
2138 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002139#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002140 *p++ = (Py_UNICODE)ch;
2141#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002143
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002144 /* translate from 10000..10FFFF to 0..FFFF */
2145 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002146
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002147 /* high surrogate = top 10 bits added to D800 */
2148 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002149
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002150 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002151 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002152#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 break;
2154
2155 default:
2156 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002157 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 startinpos = s-starts;
2159 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002160 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 }
2162 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002163 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002165 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 outpos = p-PyUnicode_AS_UNICODE(unicode);
2167 if (unicode_decode_call_errorhandler(
2168 errors, &errorHandler,
2169 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002170 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002171 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 }
Walter Dörwald69652032004-09-07 20:24:22 +00002174 if (consumed)
2175 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176
2177 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002178 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 goto onError;
2180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 Py_XDECREF(errorHandler);
2182 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return (PyObject *)unicode;
2184
2185onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002186 Py_XDECREF(errorHandler);
2187 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 Py_DECREF(unicode);
2189 return NULL;
2190}
2191
Tim Peters602f7402002-04-27 18:03:26 +00002192/* Allocation strategy: if the string is short, convert into a stack buffer
2193 and allocate exactly as much space needed at the end. Else allocate the
2194 maximum possible needed (4 result bytes per Unicode character), and return
2195 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002196*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002197PyObject *
2198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002199 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201{
Tim Peters602f7402002-04-27 18:03:26 +00002202#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002203
Guido van Rossum98297ee2007-11-06 21:34:58 +00002204 Py_ssize_t i; /* index into s of next input byte */
2205 PyObject *result; /* result string object */
2206 char *p; /* next free byte in output buffer */
2207 Py_ssize_t nallocated; /* number of result bytes allocated */
2208 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002209 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002210
Tim Peters602f7402002-04-27 18:03:26 +00002211 assert(s != NULL);
2212 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213
Tim Peters602f7402002-04-27 18:03:26 +00002214 if (size <= MAX_SHORT_UNICHARS) {
2215 /* Write into the stack buffer; nallocated can't overflow.
2216 * At the end, we'll allocate exactly as much heap space as it
2217 * turns out we need.
2218 */
2219 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002220 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002221 p = stackbuf;
2222 }
2223 else {
2224 /* Overallocate on the heap, and give the excess back at the end. */
2225 nallocated = size * 4;
2226 if (nallocated / 4 != size) /* overflow! */
2227 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002228 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002229 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002230 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002231 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002232 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002233
Tim Peters602f7402002-04-27 18:03:26 +00002234 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002235 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002236
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002237 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002238 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002240
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002242 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002243 *p++ = (char)(0xc0 | (ch >> 6));
2244 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002245 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002246 else {
Tim Peters602f7402002-04-27 18:03:26 +00002247 /* Encode UCS2 Unicode ordinals */
2248 if (ch < 0x10000) {
2249 /* Special case: check for high surrogate */
2250 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2251 Py_UCS4 ch2 = s[i];
2252 /* Check for low surrogate and combine the two to
2253 form a UCS4 value */
2254 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002255 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002256 i++;
2257 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002258 }
Tim Peters602f7402002-04-27 18:03:26 +00002259 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002260 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002262 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2263 *p++ = (char)(0x80 | (ch & 0x3f));
2264 continue;
2265 }
2266encodeUCS4:
2267 /* Encode UCS4 Unicode ordinals */
2268 *p++ = (char)(0xf0 | (ch >> 18));
2269 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2270 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2271 *p++ = (char)(0x80 | (ch & 0x3f));
2272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002274
Guido van Rossum98297ee2007-11-06 21:34:58 +00002275 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002276 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002277 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002278 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002279 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002280 }
2281 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002282 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002283 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002284 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002285 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002286 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002287 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002288
Tim Peters602f7402002-04-27 18:03:26 +00002289#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290}
2291
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2293{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 if (!PyUnicode_Check(unicode)) {
2295 PyErr_BadArgument();
2296 return NULL;
2297 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002298 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2299 PyUnicode_GET_SIZE(unicode),
2300 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301}
2302
Walter Dörwald41980ca2007-08-16 21:55:45 +00002303/* --- UTF-32 Codec ------------------------------------------------------- */
2304
2305PyObject *
2306PyUnicode_DecodeUTF32(const char *s,
2307 Py_ssize_t size,
2308 const char *errors,
2309 int *byteorder)
2310{
2311 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2312}
2313
2314PyObject *
2315PyUnicode_DecodeUTF32Stateful(const char *s,
2316 Py_ssize_t size,
2317 const char *errors,
2318 int *byteorder,
2319 Py_ssize_t *consumed)
2320{
2321 const char *starts = s;
2322 Py_ssize_t startinpos;
2323 Py_ssize_t endinpos;
2324 Py_ssize_t outpos;
2325 PyUnicodeObject *unicode;
2326 Py_UNICODE *p;
2327#ifndef Py_UNICODE_WIDE
2328 int i, pairs;
2329#else
2330 const int pairs = 0;
2331#endif
2332 const unsigned char *q, *e;
2333 int bo = 0; /* assume native ordering by default */
2334 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002335 /* Offsets from q for retrieving bytes in the right order. */
2336#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2337 int iorder[] = {0, 1, 2, 3};
2338#else
2339 int iorder[] = {3, 2, 1, 0};
2340#endif
2341 PyObject *errorHandler = NULL;
2342 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002343 /* On narrow builds we split characters outside the BMP into two
2344 codepoints => count how much extra space we need. */
2345#ifndef Py_UNICODE_WIDE
2346 for (i = pairs = 0; i < size/4; i++)
2347 if (((Py_UCS4 *)s)[i] >= 0x10000)
2348 pairs++;
2349#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002350
2351 /* This might be one to much, because of a BOM */
2352 unicode = _PyUnicode_New((size+3)/4+pairs);
2353 if (!unicode)
2354 return NULL;
2355 if (size == 0)
2356 return (PyObject *)unicode;
2357
2358 /* Unpack UTF-32 encoded data */
2359 p = unicode->str;
2360 q = (unsigned char *)s;
2361 e = q + size;
2362
2363 if (byteorder)
2364 bo = *byteorder;
2365
2366 /* Check for BOM marks (U+FEFF) in the input and adjust current
2367 byte order setting accordingly. In native mode, the leading BOM
2368 mark is skipped, in all other modes, it is copied to the output
2369 stream as-is (giving a ZWNBSP character). */
2370 if (bo == 0) {
2371 if (size >= 4) {
2372 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2373 (q[iorder[1]] << 8) | q[iorder[0]];
2374#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375 if (bom == 0x0000FEFF) {
2376 q += 4;
2377 bo = -1;
2378 }
2379 else if (bom == 0xFFFE0000) {
2380 q += 4;
2381 bo = 1;
2382 }
2383#else
2384 if (bom == 0x0000FEFF) {
2385 q += 4;
2386 bo = 1;
2387 }
2388 else if (bom == 0xFFFE0000) {
2389 q += 4;
2390 bo = -1;
2391 }
2392#endif
2393 }
2394 }
2395
2396 if (bo == -1) {
2397 /* force LE */
2398 iorder[0] = 0;
2399 iorder[1] = 1;
2400 iorder[2] = 2;
2401 iorder[3] = 3;
2402 }
2403 else if (bo == 1) {
2404 /* force BE */
2405 iorder[0] = 3;
2406 iorder[1] = 2;
2407 iorder[2] = 1;
2408 iorder[3] = 0;
2409 }
2410
2411 while (q < e) {
2412 Py_UCS4 ch;
2413 /* remaining bytes at the end? (size should be divisible by 4) */
2414 if (e-q<4) {
2415 if (consumed)
2416 break;
2417 errmsg = "truncated data";
2418 startinpos = ((const char *)q)-starts;
2419 endinpos = ((const char *)e)-starts;
2420 goto utf32Error;
2421 /* The remaining input chars are ignored if the callback
2422 chooses to skip the input */
2423 }
2424 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2425 (q[iorder[1]] << 8) | q[iorder[0]];
2426
2427 if (ch >= 0x110000)
2428 {
2429 errmsg = "codepoint not in range(0x110000)";
2430 startinpos = ((const char *)q)-starts;
2431 endinpos = startinpos+4;
2432 goto utf32Error;
2433 }
2434#ifndef Py_UNICODE_WIDE
2435 if (ch >= 0x10000)
2436 {
2437 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2438 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2439 }
2440 else
2441#endif
2442 *p++ = ch;
2443 q += 4;
2444 continue;
2445 utf32Error:
2446 outpos = p-PyUnicode_AS_UNICODE(unicode);
2447 if (unicode_decode_call_errorhandler(
2448 errors, &errorHandler,
2449 "utf32", errmsg,
2450 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002451 &unicode, &outpos, &p))
Walter Dörwald41980ca2007-08-16 21:55:45 +00002452 goto onError;
2453 }
2454
2455 if (byteorder)
2456 *byteorder = bo;
2457
2458 if (consumed)
2459 *consumed = (const char *)q-starts;
2460
2461 /* Adjust length */
2462 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2463 goto onError;
2464
2465 Py_XDECREF(errorHandler);
2466 Py_XDECREF(exc);
2467 return (PyObject *)unicode;
2468
2469onError:
2470 Py_DECREF(unicode);
2471 Py_XDECREF(errorHandler);
2472 Py_XDECREF(exc);
2473 return NULL;
2474}
2475
2476PyObject *
2477PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2478 Py_ssize_t size,
2479 const char *errors,
2480 int byteorder)
2481{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002482 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002484 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002486 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002487#else
2488 const int pairs = 0;
2489#endif
2490 /* Offsets from p for storing byte pairs in the right order. */
2491#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2492 int iorder[] = {0, 1, 2, 3};
2493#else
2494 int iorder[] = {3, 2, 1, 0};
2495#endif
2496
2497#define STORECHAR(CH) \
2498 do { \
2499 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2500 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2501 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2502 p[iorder[0]] = (CH) & 0xff; \
2503 p += 4; \
2504 } while(0)
2505
2506 /* In narrow builds we can output surrogate pairs as one codepoint,
2507 so we need less space. */
2508#ifndef Py_UNICODE_WIDE
2509 for (i = pairs = 0; i < size-1; i++)
2510 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2511 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2512 pairs++;
2513#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002514 nsize = (size - pairs + (byteorder == 0));
2515 bytesize = nsize * 4;
2516 if (bytesize / 4 != nsize)
2517 return PyErr_NoMemory();
2518 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002519 if (v == NULL)
2520 return NULL;
2521
Christian Heimes9c4756e2008-05-26 13:22:05 +00002522 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002523 if (byteorder == 0)
2524 STORECHAR(0xFEFF);
2525 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002526 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002527
2528 if (byteorder == -1) {
2529 /* force LE */
2530 iorder[0] = 0;
2531 iorder[1] = 1;
2532 iorder[2] = 2;
2533 iorder[3] = 3;
2534 }
2535 else if (byteorder == 1) {
2536 /* force BE */
2537 iorder[0] = 3;
2538 iorder[1] = 2;
2539 iorder[2] = 1;
2540 iorder[3] = 0;
2541 }
2542
2543 while (size-- > 0) {
2544 Py_UCS4 ch = *s++;
2545#ifndef Py_UNICODE_WIDE
2546 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2547 Py_UCS4 ch2 = *s;
2548 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2549 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2550 s++;
2551 size--;
2552 }
2553 }
2554#endif
2555 STORECHAR(ch);
2556 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002557
2558 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002559 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002560 Py_DECREF(v);
2561 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562#undef STORECHAR
2563}
2564
2565PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2566{
2567 if (!PyUnicode_Check(unicode)) {
2568 PyErr_BadArgument();
2569 return NULL;
2570 }
2571 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2572 PyUnicode_GET_SIZE(unicode),
2573 NULL,
2574 0);
2575}
2576
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577/* --- UTF-16 Codec ------------------------------------------------------- */
2578
Tim Peters772747b2001-08-09 22:21:55 +00002579PyObject *
2580PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002582 const char *errors,
2583 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584{
Walter Dörwald69652032004-09-07 20:24:22 +00002585 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2586}
2587
2588PyObject *
2589PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002590 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002591 const char *errors,
2592 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002593 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002596 Py_ssize_t startinpos;
2597 Py_ssize_t endinpos;
2598 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 PyUnicodeObject *unicode;
2600 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002601 const unsigned char *q, *e;
2602 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002603 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002604 /* Offsets from q for retrieving byte pairs in the right order. */
2605#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2606 int ihi = 1, ilo = 0;
2607#else
2608 int ihi = 0, ilo = 1;
2609#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 PyObject *errorHandler = NULL;
2611 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
2613 /* Note: size will always be longer than the resulting Unicode
2614 character count */
2615 unicode = _PyUnicode_New(size);
2616 if (!unicode)
2617 return NULL;
2618 if (size == 0)
2619 return (PyObject *)unicode;
2620
2621 /* Unpack UTF-16 encoded data */
2622 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002623 q = (unsigned char *)s;
2624 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625
2626 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002627 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002629 /* Check for BOM marks (U+FEFF) in the input and adjust current
2630 byte order setting accordingly. In native mode, the leading BOM
2631 mark is skipped, in all other modes, it is copied to the output
2632 stream as-is (giving a ZWNBSP character). */
2633 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002634 if (size >= 2) {
2635 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002636#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002637 if (bom == 0xFEFF) {
2638 q += 2;
2639 bo = -1;
2640 }
2641 else if (bom == 0xFFFE) {
2642 q += 2;
2643 bo = 1;
2644 }
Tim Petersced69f82003-09-16 20:30:58 +00002645#else
Walter Dörwald69652032004-09-07 20:24:22 +00002646 if (bom == 0xFEFF) {
2647 q += 2;
2648 bo = 1;
2649 }
2650 else if (bom == 0xFFFE) {
2651 q += 2;
2652 bo = -1;
2653 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002654#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002655 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657
Tim Peters772747b2001-08-09 22:21:55 +00002658 if (bo == -1) {
2659 /* force LE */
2660 ihi = 1;
2661 ilo = 0;
2662 }
2663 else if (bo == 1) {
2664 /* force BE */
2665 ihi = 0;
2666 ilo = 1;
2667 }
2668
2669 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002671 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002673 if (consumed)
2674 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 errmsg = "truncated data";
2676 startinpos = ((const char *)q)-starts;
2677 endinpos = ((const char *)e)-starts;
2678 goto utf16Error;
2679 /* The remaining input chars are ignored if the callback
2680 chooses to skip the input */
2681 }
2682 ch = (q[ihi] << 8) | q[ilo];
2683
Tim Peters772747b2001-08-09 22:21:55 +00002684 q += 2;
2685
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 if (ch < 0xD800 || ch > 0xDFFF) {
2687 *p++ = ch;
2688 continue;
2689 }
2690
2691 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002692 if (q >= e) {
2693 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 startinpos = (((const char *)q)-2)-starts;
2695 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002696 goto utf16Error;
2697 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002698 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002699 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2700 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002701 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002702#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002703 *p++ = ch;
2704 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#else
2706 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002707#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002709 }
2710 else {
2711 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 startinpos = (((const char *)q)-4)-starts;
2713 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002714 goto utf16Error;
2715 }
2716
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002718 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 startinpos = (((const char *)q)-2)-starts;
2720 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002721 /* Fall through to report the error */
2722
2723 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 outpos = p-PyUnicode_AS_UNICODE(unicode);
2725 if (unicode_decode_call_errorhandler(
2726 errors, &errorHandler,
2727 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002728 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002729 &unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 }
2732
2733 if (byteorder)
2734 *byteorder = bo;
2735
Walter Dörwald69652032004-09-07 20:24:22 +00002736 if (consumed)
2737 *consumed = (const char *)q-starts;
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002740 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 goto onError;
2742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 return (PyObject *)unicode;
2746
2747onError:
2748 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 return NULL;
2752}
2753
Tim Peters772747b2001-08-09 22:21:55 +00002754PyObject *
2755PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002756 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002757 const char *errors,
2758 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002760 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002761 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002762 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002763#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002764 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002765#else
2766 const int pairs = 0;
2767#endif
Tim Peters772747b2001-08-09 22:21:55 +00002768 /* Offsets from p for storing byte pairs in the right order. */
2769#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2770 int ihi = 1, ilo = 0;
2771#else
2772 int ihi = 0, ilo = 1;
2773#endif
2774
2775#define STORECHAR(CH) \
2776 do { \
2777 p[ihi] = ((CH) >> 8) & 0xff; \
2778 p[ilo] = (CH) & 0xff; \
2779 p += 2; \
2780 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002782#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002783 for (i = pairs = 0; i < size; i++)
2784 if (s[i] >= 0x10000)
2785 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002786#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002787 /* 2 * (size + pairs + (byteorder == 0)) */
2788 if (size > PY_SSIZE_T_MAX ||
2789 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2790 return PyErr_NoMemory();
2791 nsize = size + pairs + (byteorder == 0);
2792 bytesize = nsize * 2;
2793 if (bytesize / 2 != nsize)
2794 return PyErr_NoMemory();
2795 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 if (v == NULL)
2797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Christian Heimes9c4756e2008-05-26 13:22:05 +00002799 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002801 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002802 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002803 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002804
2805 if (byteorder == -1) {
2806 /* force LE */
2807 ihi = 1;
2808 ilo = 0;
2809 }
2810 else if (byteorder == 1) {
2811 /* force BE */
2812 ihi = 0;
2813 ilo = 1;
2814 }
2815
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002816 while (size-- > 0) {
2817 Py_UNICODE ch = *s++;
2818 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002819#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002820 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002821 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2822 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002824#endif
Tim Peters772747b2001-08-09 22:21:55 +00002825 STORECHAR(ch);
2826 if (ch2)
2827 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002828 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829
2830 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002831 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002832 Py_DECREF(v);
2833 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002834#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835}
2836
2837PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2838{
2839 if (!PyUnicode_Check(unicode)) {
2840 PyErr_BadArgument();
2841 return NULL;
2842 }
2843 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2844 PyUnicode_GET_SIZE(unicode),
2845 NULL,
2846 0);
2847}
2848
2849/* --- Unicode Escape Codec ----------------------------------------------- */
2850
Fredrik Lundh06d12682001-01-24 07:59:11 +00002851static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002852
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 const char *errors)
2856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 Py_ssize_t startinpos;
2859 Py_ssize_t endinpos;
2860 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002865 char* message;
2866 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 PyObject *errorHandler = NULL;
2868 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002869
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 /* Escaped strings will always be longer than the resulting
2871 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 length after conversion to the true value.
2873 (but if the error callback returns a long replacement string
2874 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 v = _PyUnicode_New(size);
2876 if (v == NULL)
2877 goto onError;
2878 if (size == 0)
2879 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 while (s < end) {
2885 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002886 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888
2889 /* Non-escape characters are interpreted as Unicode ordinals */
2890 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002891 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 continue;
2893 }
2894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 /* \ - Escapes */
2897 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002898 c = *s++;
2899 if (s > end)
2900 c = '\0'; /* Invalid after \ */
2901 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902
2903 /* \x escapes */
2904 case '\n': break;
2905 case '\\': *p++ = '\\'; break;
2906 case '\'': *p++ = '\''; break;
2907 case '\"': *p++ = '\"'; break;
2908 case 'b': *p++ = '\b'; break;
2909 case 'f': *p++ = '\014'; break; /* FF */
2910 case 't': *p++ = '\t'; break;
2911 case 'n': *p++ = '\n'; break;
2912 case 'r': *p++ = '\r'; break;
2913 case 'v': *p++ = '\013'; break; /* VT */
2914 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2915
2916 /* \OOO (octal) escapes */
2917 case '0': case '1': case '2': case '3':
2918 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002919 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002920 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002921 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002922 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002923 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002925 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 break;
2927
Fredrik Lundhccc74732001-02-18 22:13:49 +00002928 /* hex escapes */
2929 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002931 digits = 2;
2932 message = "truncated \\xXX escape";
2933 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002937 digits = 4;
2938 message = "truncated \\uXXXX escape";
2939 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940
Fredrik Lundhccc74732001-02-18 22:13:49 +00002941 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002942 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002943 digits = 8;
2944 message = "truncated \\UXXXXXXXX escape";
2945 hexescape:
2946 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 outpos = p-PyUnicode_AS_UNICODE(v);
2948 if (s+digits>end) {
2949 endinpos = size;
2950 if (unicode_decode_call_errorhandler(
2951 errors, &errorHandler,
2952 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002953 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002954 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 goto onError;
2956 goto nextByte;
2957 }
2958 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002959 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002960 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 endinpos = (s+i+1)-starts;
2962 if (unicode_decode_call_errorhandler(
2963 errors, &errorHandler,
2964 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002965 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002966 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002969 }
2970 chr = (chr<<4) & ~0xF;
2971 if (c >= '0' && c <= '9')
2972 chr += c - '0';
2973 else if (c >= 'a' && c <= 'f')
2974 chr += 10 + c - 'a';
2975 else
2976 chr += 10 + c - 'A';
2977 }
2978 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002979 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 /* _decoding_error will have already written into the
2981 target buffer. */
2982 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002983 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002984 /* when we get here, chr is a 32-bit unicode character */
2985 if (chr <= 0xffff)
2986 /* UCS-2 character */
2987 *p++ = (Py_UNICODE) chr;
2988 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002989 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002990 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002991#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002992 *p++ = chr;
2993#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002994 chr -= 0x10000L;
2995 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002996 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002997#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002998 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 endinpos = s-starts;
3000 outpos = p-PyUnicode_AS_UNICODE(v);
3001 if (unicode_decode_call_errorhandler(
3002 errors, &errorHandler,
3003 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003004 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003005 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003006 goto onError;
3007 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003008 break;
3009
3010 /* \N{name} */
3011 case 'N':
3012 message = "malformed \\N character escape";
3013 if (ucnhash_CAPI == NULL) {
3014 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003015 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003016 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003017 if (m == NULL)
3018 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003019 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003020 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003021 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003022 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003023 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003024 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003025 if (ucnhash_CAPI == NULL)
3026 goto ucnhashError;
3027 }
3028 if (*s == '{') {
3029 const char *start = s+1;
3030 /* look for the closing brace */
3031 while (*s != '}' && s < end)
3032 s++;
3033 if (s > start && s < end && *s == '}') {
3034 /* found a name. look it up in the unicode database */
3035 message = "unknown Unicode character name";
3036 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003037 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003038 goto store;
3039 }
3040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 endinpos = s-starts;
3042 outpos = p-PyUnicode_AS_UNICODE(v);
3043 if (unicode_decode_call_errorhandler(
3044 errors, &errorHandler,
3045 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003046 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003047 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003048 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003049 break;
3050
3051 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003052 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 message = "\\ at end of string";
3054 s--;
3055 endinpos = s-starts;
3056 outpos = p-PyUnicode_AS_UNICODE(v);
3057 if (unicode_decode_call_errorhandler(
3058 errors, &errorHandler,
3059 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003060 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003061 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003062 goto onError;
3063 }
3064 else {
3065 *p++ = '\\';
3066 *p++ = (unsigned char)s[-1];
3067 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003068 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 nextByte:
3071 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003073 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003075 Py_XDECREF(errorHandler);
3076 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003078
Fredrik Lundhccc74732001-02-18 22:13:49 +00003079ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003080 PyErr_SetString(
3081 PyExc_UnicodeError,
3082 "\\N escapes not supported (can't load unicodedata module)"
3083 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003084 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 Py_XDECREF(errorHandler);
3086 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003087 return NULL;
3088
Fredrik Lundhccc74732001-02-18 22:13:49 +00003089onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 Py_XDECREF(errorHandler);
3092 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 return NULL;
3094}
3095
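/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE: the sentence about quotes is stale.  PyUnicode_EncodeUnicodeEscape,
   defined below, takes no quotes argument and never emits enclosing quotes.

*/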
3102
Thomas Wouters477c8d52006-05-27 19:21:47 +00003103Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3104 Py_ssize_t size,
3105 Py_UNICODE ch)
3106{
3107 /* like wcschr, but doesn't stop at NULL characters */
3108
3109 while (size-- > 0) {
3110 if (*s == ch)
3111 return s;
3112 s++;
3113 }
3114
3115 return NULL;
3116}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003117
/* Lower-case hexadecimal digits, indexed by a nibble value (0..15).
   Used by the escape encoders below to emit \xhh, \uxxxx and \Uxxxxxxxx.
   Both the characters and the pointer itself are const, so the table
   cannot be retargeted by accident. */
static const char * const hexdigits = "0123456789abcdef";
3119
3120PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3121 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003123 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003126#ifdef Py_UNICODE_WIDE
3127 const Py_ssize_t expandsize = 10;
3128#else
3129 const Py_ssize_t expandsize = 6;
3130#endif
3131
Thomas Wouters89f507f2006-12-13 04:49:30 +00003132 /* XXX(nnorwitz): rather than over-allocating, it would be
3133 better to choose a different scheme. Perhaps scan the
3134 first N-chars of the string and allocate based on that size.
3135 */
3136 /* Initial allocation is based on the longest-possible unichr
3137 escape.
3138
3139 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3140 unichr, so in this case it's the longest unichr escape. In
3141 narrow (UTF-16) builds this is five chars per source unichr
3142 since there are two unichrs in the surrogate pair, so in narrow
3143 (UTF-16) builds it's not the longest unichr escape.
3144
3145 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3146 so in the narrow (UTF-16) build case it's the longest unichr
3147 escape.
3148 */
3149
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003150 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3151 return PyErr_NoMemory();
3152
Christian Heimes9c4756e2008-05-26 13:22:05 +00003153 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003154 2
3155 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003156 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (repr == NULL)
3158 return NULL;
3159
Christian Heimes9c4756e2008-05-26 13:22:05 +00003160 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 while (size-- > 0) {
3163 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003164
Walter Dörwald79e913e2007-05-12 11:08:06 +00003165 /* Escape backslashes */
3166 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 *p++ = '\\';
3168 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003169 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003170 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003171
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003172#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003173 /* Map 21-bit characters to '\U00xxxxxx' */
3174 else if (ch >= 0x10000) {
3175 *p++ = '\\';
3176 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003177 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3178 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3179 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3180 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3181 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3182 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3183 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3184 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003185 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003186 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003187#else
3188 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003189 else if (ch >= 0xD800 && ch < 0xDC00) {
3190 Py_UNICODE ch2;
3191 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003192
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003193 ch2 = *s++;
3194 size--;
3195 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3196 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3197 *p++ = '\\';
3198 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003199 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3200 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3201 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3202 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3203 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3204 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3205 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3206 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003207 continue;
3208 }
3209 /* Fall through: isolated surrogates are copied as-is */
3210 s--;
3211 size++;
3212 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003213#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003216 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 *p++ = '\\';
3218 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003219 *p++ = hexdigits[(ch >> 12) & 0x000F];
3220 *p++ = hexdigits[(ch >> 8) & 0x000F];
3221 *p++ = hexdigits[(ch >> 4) & 0x000F];
3222 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003225 /* Map special whitespace to '\t', \n', '\r' */
3226 else if (ch == '\t') {
3227 *p++ = '\\';
3228 *p++ = 't';
3229 }
3230 else if (ch == '\n') {
3231 *p++ = '\\';
3232 *p++ = 'n';
3233 }
3234 else if (ch == '\r') {
3235 *p++ = '\\';
3236 *p++ = 'r';
3237 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003238
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003239 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003240 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003242 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003243 *p++ = hexdigits[(ch >> 4) & 0x000F];
3244 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003245 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003246
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 /* Copy everything else as-is */
3248 else
3249 *p++ = (char) ch;
3250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251
Christian Heimes72b710a2008-05-26 13:28:38 +00003252 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003253 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003254 Py_DECREF(repr);
3255 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256}
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3259{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003260 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 return NULL;
3264 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003265 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3266 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003267 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268}
3269
3270/* --- Raw Unicode Escape Codec ------------------------------------------- */
3271
3272PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003273 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 const char *errors)
3275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t startinpos;
3278 Py_ssize_t endinpos;
3279 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 const char *end;
3283 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 PyObject *errorHandler = NULL;
3285 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003286
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 /* Escaped strings will always be longer than the resulting
3288 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 length after conversion to the true value. (But decoding error
3290 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 v = _PyUnicode_New(size);
3292 if (v == NULL)
3293 goto onError;
3294 if (size == 0)
3295 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 end = s + size;
3298 while (s < end) {
3299 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003300 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003302 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
3304 /* Non-escape characters are interpreted as Unicode ordinals */
3305 if (*s != '\\') {
3306 *p++ = (unsigned char)*s++;
3307 continue;
3308 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
3311 /* \u-escapes are only interpreted iff the number of leading
3312 backslashes if odd */
3313 bs = s;
3314 for (;s < end;) {
3315 if (*s != '\\')
3316 break;
3317 *p++ = (unsigned char)*s++;
3318 }
3319 if (((s - bs) & 1) == 0 ||
3320 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003321 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 continue;
3323 }
3324 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 s++;
3327
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003328 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003330 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003332 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 endinpos = s-starts;
3334 if (unicode_decode_call_errorhandler(
3335 errors, &errorHandler,
3336 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003337 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003338 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
3342 x = (x<<4) & ~0xF;
3343 if (c >= '0' && c <= '9')
3344 x += c - '0';
3345 else if (c >= 'a' && c <= 'f')
3346 x += 10 + c - 'a';
3347 else
3348 x += 10 + c - 'A';
3349 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003350 if (x <= 0xffff)
3351 /* UCS-2 character */
3352 *p++ = (Py_UNICODE) x;
3353 else if (x <= 0x10ffff) {
3354 /* UCS-4 character. Either store directly, or as
3355 surrogate pair. */
3356#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003357 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003358#else
3359 x -= 0x10000L;
3360 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3361 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3362#endif
3363 } else {
3364 endinpos = s-starts;
3365 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003366 if (unicode_decode_call_errorhandler(
3367 errors, &errorHandler,
3368 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003369 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003370 &v, &outpos, &p))
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003371 goto onError;
3372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373 nextByte:
3374 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003376 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003377 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_XDECREF(errorHandler);
3379 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 onError:
3383 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 Py_XDECREF(errorHandler);
3385 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 return NULL;
3387}
3388
3389PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003390 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003392 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 char *p;
3394 char *q;
3395
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003396#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003397 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003398#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003399 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003400#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003401
3402 if (size > PY_SSIZE_T_MAX / expandsize)
3403 return PyErr_NoMemory();
3404
3405 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 if (repr == NULL)
3407 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003408 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003409 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410
Christian Heimes9c4756e2008-05-26 13:22:05 +00003411 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 while (size-- > 0) {
3413 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003414#ifdef Py_UNICODE_WIDE
3415 /* Map 32-bit characters to '\Uxxxxxxxx' */
3416 if (ch >= 0x10000) {
3417 *p++ = '\\';
3418 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003419 *p++ = hexdigits[(ch >> 28) & 0xf];
3420 *p++ = hexdigits[(ch >> 24) & 0xf];
3421 *p++ = hexdigits[(ch >> 20) & 0xf];
3422 *p++ = hexdigits[(ch >> 16) & 0xf];
3423 *p++ = hexdigits[(ch >> 12) & 0xf];
3424 *p++ = hexdigits[(ch >> 8) & 0xf];
3425 *p++ = hexdigits[(ch >> 4) & 0xf];
3426 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003427 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003428 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003429#else
3430 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3431 if (ch >= 0xD800 && ch < 0xDC00) {
3432 Py_UNICODE ch2;
3433 Py_UCS4 ucs;
3434
3435 ch2 = *s++;
3436 size--;
3437 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3438 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3439 *p++ = '\\';
3440 *p++ = 'U';
3441 *p++ = hexdigits[(ucs >> 28) & 0xf];
3442 *p++ = hexdigits[(ucs >> 24) & 0xf];
3443 *p++ = hexdigits[(ucs >> 20) & 0xf];
3444 *p++ = hexdigits[(ucs >> 16) & 0xf];
3445 *p++ = hexdigits[(ucs >> 12) & 0xf];
3446 *p++ = hexdigits[(ucs >> 8) & 0xf];
3447 *p++ = hexdigits[(ucs >> 4) & 0xf];
3448 *p++ = hexdigits[ucs & 0xf];
3449 continue;
3450 }
3451 /* Fall through: isolated surrogates are copied as-is */
3452 s--;
3453 size++;
3454 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003455#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 /* Map 16-bit characters to '\uxxxx' */
3457 if (ch >= 256) {
3458 *p++ = '\\';
3459 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003460 *p++ = hexdigits[(ch >> 12) & 0xf];
3461 *p++ = hexdigits[(ch >> 8) & 0xf];
3462 *p++ = hexdigits[(ch >> 4) & 0xf];
3463 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 }
3465 /* Copy everything else as-is */
3466 else
3467 *p++ = (char) ch;
3468 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003469 size = p - q;
3470
3471 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003472 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003473 Py_DECREF(repr);
3474 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475}
3476
3477PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3478{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003479 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003481 PyErr_BadArgument();
3482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003484 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3485 PyUnicode_GET_SIZE(unicode));
3486
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003487 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488}
3489
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003490/* --- Unicode Internal Codec ------------------------------------------- */
3491
3492PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003493 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003494 const char *errors)
3495{
3496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 Py_ssize_t startinpos;
3498 Py_ssize_t endinpos;
3499 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003500 PyUnicodeObject *v;
3501 Py_UNICODE *p;
3502 const char *end;
3503 const char *reason;
3504 PyObject *errorHandler = NULL;
3505 PyObject *exc = NULL;
3506
Neal Norwitzd43069c2006-01-08 01:12:10 +00003507#ifdef Py_UNICODE_WIDE
3508 Py_UNICODE unimax = PyUnicode_GetMax();
3509#endif
3510
Thomas Wouters89f507f2006-12-13 04:49:30 +00003511 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003512 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3513 if (v == NULL)
3514 goto onError;
3515 if (PyUnicode_GetSize((PyObject *)v) == 0)
3516 return (PyObject *)v;
3517 p = PyUnicode_AS_UNICODE(v);
3518 end = s + size;
3519
3520 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003521 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003522 /* We have to sanity check the raw data, otherwise doom looms for
3523 some malformed UCS-4 data. */
3524 if (
3525 #ifdef Py_UNICODE_WIDE
3526 *p > unimax || *p < 0 ||
3527 #endif
3528 end-s < Py_UNICODE_SIZE
3529 )
3530 {
3531 startinpos = s - starts;
3532 if (end-s < Py_UNICODE_SIZE) {
3533 endinpos = end-starts;
3534 reason = "truncated input";
3535 }
3536 else {
3537 endinpos = s - starts + Py_UNICODE_SIZE;
3538 reason = "illegal code point (> 0x10FFFF)";
3539 }
3540 outpos = p - PyUnicode_AS_UNICODE(v);
3541 if (unicode_decode_call_errorhandler(
3542 errors, &errorHandler,
3543 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003544 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003545 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003546 goto onError;
3547 }
3548 }
3549 else {
3550 p++;
3551 s += Py_UNICODE_SIZE;
3552 }
3553 }
3554
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003555 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003556 goto onError;
3557 Py_XDECREF(errorHandler);
3558 Py_XDECREF(exc);
3559 return (PyObject *)v;
3560
3561 onError:
3562 Py_XDECREF(v);
3563 Py_XDECREF(errorHandler);
3564 Py_XDECREF(exc);
3565 return NULL;
3566}
3567
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568/* --- Latin-1 Codec ------------------------------------------------------ */
3569
3570PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003571 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 const char *errors)
3573{
3574 PyUnicodeObject *v;
3575 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003578 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003579 Py_UNICODE r = *(unsigned char*)s;
3580 return PyUnicode_FromUnicode(&r, 1);
3581 }
3582
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 v = _PyUnicode_New(size);
3584 if (v == NULL)
3585 goto onError;
3586 if (size == 0)
3587 return (PyObject *)v;
3588 p = PyUnicode_AS_UNICODE(v);
3589 while (size-- > 0)
3590 *p++ = (unsigned char)*s++;
3591 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003592
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 onError:
3594 Py_XDECREF(v);
3595 return NULL;
3596}
3597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598/* create or adjust a UnicodeEncodeError */
3599static void make_encode_exception(PyObject **exceptionObject,
3600 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 const Py_UNICODE *unicode, Py_ssize_t size,
3602 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 if (*exceptionObject == NULL) {
3606 *exceptionObject = PyUnicodeEncodeError_Create(
3607 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 }
3609 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3611 goto onError;
3612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3613 goto onError;
3614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3615 goto onError;
3616 return;
3617 onError:
3618 Py_DECREF(*exceptionObject);
3619 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621}
3622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623/* raises a UnicodeEncodeError */
3624static void raise_encode_exception(PyObject **exceptionObject,
3625 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 const Py_UNICODE *unicode, Py_ssize_t size,
3627 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 const char *reason)
3629{
3630 make_encode_exception(exceptionObject,
3631 encoding, unicode, size, startpos, endpos, reason);
3632 if (*exceptionObject != NULL)
3633 PyCodec_StrictErrors(*exceptionObject);
3634}
3635
3636/* error handling callback helper:
3637 build arguments, call the callback and check the arguments,
3638 put the result into newpos and return the replacement string, which
3639 has to be freed by the caller */
3640static PyObject *unicode_encode_call_errorhandler(const char *errors,
3641 PyObject **errorHandler,
3642 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003643 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3644 Py_ssize_t startpos, Py_ssize_t endpos,
3645 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003647 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648
3649 PyObject *restuple;
3650 PyObject *resunicode;
3651
3652 if (*errorHandler == NULL) {
3653 *errorHandler = PyCodec_LookupError(errors);
3654 if (*errorHandler == NULL)
3655 return NULL;
3656 }
3657
3658 make_encode_exception(exceptionObject,
3659 encoding, unicode, size, startpos, endpos, reason);
3660 if (*exceptionObject == NULL)
3661 return NULL;
3662
3663 restuple = PyObject_CallFunctionObjArgs(
3664 *errorHandler, *exceptionObject, NULL);
3665 if (restuple == NULL)
3666 return NULL;
3667 if (!PyTuple_Check(restuple)) {
3668 PyErr_Format(PyExc_TypeError, &argparse[4]);
3669 Py_DECREF(restuple);
3670 return NULL;
3671 }
3672 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3673 &resunicode, newpos)) {
3674 Py_DECREF(restuple);
3675 return NULL;
3676 }
3677 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003678 *newpos = size+*newpos;
3679 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003680 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003681 Py_DECREF(restuple);
3682 return NULL;
3683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_INCREF(resunicode);
3685 Py_DECREF(restuple);
3686 return resunicode;
3687}
3688
3689static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003690 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 const char *errors,
3692 int limit)
3693{
3694 /* output object */
3695 PyObject *res;
3696 /* pointers to the beginning and end+1 of input */
3697 const Py_UNICODE *startp = p;
3698 const Py_UNICODE *endp = p + size;
3699 /* pointer to the beginning of the unencodable characters */
3700 /* const Py_UNICODE *badp = NULL; */
3701 /* pointer into the output */
3702 char *str;
3703 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003704 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003705 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3706 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 PyObject *errorHandler = NULL;
3708 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003709 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* the following variable is used for caching string comparisons
3711 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3712 int known_errorHandler = -1;
3713
3714 /* allocate enough for a simple encoding without
3715 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003716 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003717 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003718 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003720 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003721 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 ressize = size;
3723
3724 while (p<endp) {
3725 Py_UNICODE c = *p;
3726
3727 /* can we encode this? */
3728 if (c<limit) {
3729 /* no overflow check, because we know that the space is enough */
3730 *str++ = (char)c;
3731 ++p;
3732 }
3733 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003734 Py_ssize_t unicodepos = p-startp;
3735 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t repsize;
3738 Py_ssize_t newpos;
3739 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 Py_UNICODE *uni2;
3741 /* startpos for collecting unencodable chars */
3742 const Py_UNICODE *collstart = p;
3743 const Py_UNICODE *collend = p;
3744 /* find all unecodable characters */
3745 while ((collend < endp) && ((*collend)>=limit))
3746 ++collend;
3747 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3748 if (known_errorHandler==-1) {
3749 if ((errors==NULL) || (!strcmp(errors, "strict")))
3750 known_errorHandler = 1;
3751 else if (!strcmp(errors, "replace"))
3752 known_errorHandler = 2;
3753 else if (!strcmp(errors, "ignore"))
3754 known_errorHandler = 3;
3755 else if (!strcmp(errors, "xmlcharrefreplace"))
3756 known_errorHandler = 4;
3757 else
3758 known_errorHandler = 0;
3759 }
3760 switch (known_errorHandler) {
3761 case 1: /* strict */
3762 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3763 goto onError;
3764 case 2: /* replace */
3765 while (collstart++<collend)
3766 *str++ = '?'; /* fall through */
3767 case 3: /* ignore */
3768 p = collend;
3769 break;
3770 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003771 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 /* determine replacement size (temporarily (mis)uses p) */
3773 for (p = collstart, repsize = 0; p < collend; ++p) {
3774 if (*p<10)
3775 repsize += 2+1+1;
3776 else if (*p<100)
3777 repsize += 2+2+1;
3778 else if (*p<1000)
3779 repsize += 2+3+1;
3780 else if (*p<10000)
3781 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003782#ifndef Py_UNICODE_WIDE
3783 else
3784 repsize += 2+5+1;
3785#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 else if (*p<100000)
3787 repsize += 2+5+1;
3788 else if (*p<1000000)
3789 repsize += 2+6+1;
3790 else
3791 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003792#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 }
3794 requiredsize = respos+repsize+(endp-collend);
3795 if (requiredsize > ressize) {
3796 if (requiredsize<2*ressize)
3797 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003798 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003800 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 ressize = requiredsize;
3802 }
3803 /* generate replacement (temporarily (mis)uses p) */
3804 for (p = collstart; p < collend; ++p) {
3805 str += sprintf(str, "&#%d;", (int)*p);
3806 }
3807 p = collend;
3808 break;
3809 default:
3810 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3811 encoding, reason, startp, size, &exc,
3812 collstart-startp, collend-startp, &newpos);
3813 if (repunicode == NULL)
3814 goto onError;
3815 /* need more space? (at least enough for what we
3816 have+the replacement+the rest of the string, so
3817 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003818 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 repsize = PyUnicode_GET_SIZE(repunicode);
3820 requiredsize = respos+repsize+(endp-collend);
3821 if (requiredsize > ressize) {
3822 if (requiredsize<2*ressize)
3823 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003824 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 Py_DECREF(repunicode);
3826 goto onError;
3827 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003828 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 ressize = requiredsize;
3830 }
3831 /* check if there is anything unencodable in the replacement
3832 and copy it to the output */
3833 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3834 c = *uni2;
3835 if (c >= limit) {
3836 raise_encode_exception(&exc, encoding, startp, size,
3837 unicodepos, unicodepos+1, reason);
3838 Py_DECREF(repunicode);
3839 goto onError;
3840 }
3841 *str = (char)c;
3842 }
3843 p = startp + newpos;
3844 Py_DECREF(repunicode);
3845 }
3846 }
3847 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003848 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003849 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003850 onError:
3851 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 Py_XDECREF(errorHandler);
3853 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003854 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855}
3856
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003858 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 const char *errors)
3860{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862}
3863
3864PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3865{
3866 if (!PyUnicode_Check(unicode)) {
3867 PyErr_BadArgument();
3868 return NULL;
3869 }
3870 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3871 PyUnicode_GET_SIZE(unicode),
3872 NULL);
3873}
3874
3875/* --- 7-bit ASCII Codec -------------------------------------------------- */
3876
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 const char *errors)
3880{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 PyUnicodeObject *v;
3883 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003884 Py_ssize_t startinpos;
3885 Py_ssize_t endinpos;
3886 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 const char *e;
3888 PyObject *errorHandler = NULL;
3889 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003892 if (size == 1 && *(unsigned char*)s < 128) {
3893 Py_UNICODE r = *(unsigned char*)s;
3894 return PyUnicode_FromUnicode(&r, 1);
3895 }
Tim Petersced69f82003-09-16 20:30:58 +00003896
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 v = _PyUnicode_New(size);
3898 if (v == NULL)
3899 goto onError;
3900 if (size == 0)
3901 return (PyObject *)v;
3902 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 e = s + size;
3904 while (s < e) {
3905 register unsigned char c = (unsigned char)*s;
3906 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 ++s;
3909 }
3910 else {
3911 startinpos = s-starts;
3912 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003913 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 if (unicode_decode_call_errorhandler(
3915 errors, &errorHandler,
3916 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003917 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003918 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003922 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003923 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003924 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 Py_XDECREF(errorHandler);
3926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003928
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 onError:
3930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 Py_XDECREF(errorHandler);
3932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 return NULL;
3934}
3935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003937 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 const char *errors)
3939{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941}
3942
3943PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3944{
3945 if (!PyUnicode_Check(unicode)) {
3946 PyErr_BadArgument();
3947 return NULL;
3948 }
3949 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3950 PyUnicode_GET_SIZE(unicode),
3951 NULL);
3952}
3953
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003954#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003955
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003956/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003958#if SIZEOF_INT < SIZEOF_SSIZE_T
3959#define NEED_RETRY
3960#endif
3961
3962/* XXX This code is limited to "true" double-byte encodings, as
3963 a) it assumes an incomplete character consists of a single byte, and
3964 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3965 encodings, see IsDBCSLeadByteEx documentation. */
3966
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character (rather than being the trail byte of the previous one),
   0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3977
3978/*
3979 * Decode MBCS string into unicode object. If 'final' is set, converts
3980 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3981 */
3982static int decode_mbcs(PyUnicodeObject **v,
3983 const char *s, /* MBCS string */
3984 int size, /* sizeof MBCS string */
3985 int final)
3986{
3987 Py_UNICODE *p;
3988 Py_ssize_t n = 0;
3989 int usize = 0;
3990
3991 assert(size >= 0);
3992
3993 /* Skip trailing lead-byte unless 'final' is set */
3994 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3995 --size;
3996
3997 /* First get the size of the result */
3998 if (size > 0) {
3999 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4000 if (usize == 0) {
4001 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4002 return -1;
4003 }
4004 }
4005
4006 if (*v == NULL) {
4007 /* Create unicode object */
4008 *v = _PyUnicode_New(usize);
4009 if (*v == NULL)
4010 return -1;
4011 }
4012 else {
4013 /* Extend unicode object */
4014 n = PyUnicode_GET_SIZE(*v);
4015 if (_PyUnicode_Resize(v, n + usize) < 0)
4016 return -1;
4017 }
4018
4019 /* Do the conversion */
4020 if (size > 0) {
4021 p = PyUnicode_AS_UNICODE(*v) + n;
4022 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4023 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4024 return -1;
4025 }
4026 }
4027
4028 return size;
4029}
4030
4031PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4032 Py_ssize_t size,
4033 const char *errors,
4034 Py_ssize_t *consumed)
4035{
4036 PyUnicodeObject *v = NULL;
4037 int done;
4038
4039 if (consumed)
4040 *consumed = 0;
4041
4042#ifdef NEED_RETRY
4043 retry:
4044 if (size > INT_MAX)
4045 done = decode_mbcs(&v, s, INT_MAX, 0);
4046 else
4047#endif
4048 done = decode_mbcs(&v, s, (int)size, !consumed);
4049
4050 if (done < 0) {
4051 Py_XDECREF(v);
4052 return NULL;
4053 }
4054
4055 if (consumed)
4056 *consumed += done;
4057
4058#ifdef NEED_RETRY
4059 if (size > INT_MAX) {
4060 s += done;
4061 size -= done;
4062 goto retry;
4063 }
4064#endif
4065
4066 return (PyObject *)v;
4067}
4068
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004069PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004071 const char *errors)
4072{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004073 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4074}
4075
4076/*
4077 * Convert unicode into string object (MBCS).
4078 * Returns 0 if succeed, -1 otherwise.
4079 */
4080static int encode_mbcs(PyObject **repr,
4081 const Py_UNICODE *p, /* unicode */
4082 int size) /* size of unicode */
4083{
4084 int mbcssize = 0;
4085 Py_ssize_t n = 0;
4086
4087 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004088
4089 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004090 if (size > 0) {
4091 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4092 if (mbcssize == 0) {
4093 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4094 return -1;
4095 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004096 }
4097
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004098 if (*repr == NULL) {
4099 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004100 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004101 if (*repr == NULL)
4102 return -1;
4103 }
4104 else {
4105 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004106 n = PyBytes_Size(*repr);
4107 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004108 return -1;
4109 }
4110
4111 /* Do the conversion */
4112 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004113 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004114 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4115 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4116 return -1;
4117 }
4118 }
4119
4120 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004121}
4122
4123PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004125 const char *errors)
4126{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004127 PyObject *repr = NULL;
4128 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004129
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004130#ifdef NEED_RETRY
4131 retry:
4132 if (size > INT_MAX)
4133 ret = encode_mbcs(&repr, p, INT_MAX);
4134 else
4135#endif
4136 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004138 if (ret < 0) {
4139 Py_XDECREF(repr);
4140 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004141 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004142
4143#ifdef NEED_RETRY
4144 if (size > INT_MAX) {
4145 p += INT_MAX;
4146 size -= INT_MAX;
4147 goto retry;
4148 }
4149#endif
4150
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004151 return repr;
4152}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004153
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004154PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4155{
4156 if (!PyUnicode_Check(unicode)) {
4157 PyErr_BadArgument();
4158 return NULL;
4159 }
4160 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4161 PyUnicode_GET_SIZE(unicode),
4162 NULL);
4163}
4164
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004165#undef NEED_RETRY
4166
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004167#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004168
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169/* --- Character Mapping Codec -------------------------------------------- */
4170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004172 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 PyObject *mapping,
4174 const char *errors)
4175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004177 Py_ssize_t startinpos;
4178 Py_ssize_t endinpos;
4179 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 PyUnicodeObject *v;
4182 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004183 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 PyObject *errorHandler = NULL;
4185 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004186 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004187 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004188
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 /* Default to Latin-1 */
4190 if (mapping == NULL)
4191 return PyUnicode_DecodeLatin1(s, size, errors);
4192
4193 v = _PyUnicode_New(size);
4194 if (v == NULL)
4195 goto onError;
4196 if (size == 0)
4197 return (PyObject *)v;
4198 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004200 if (PyUnicode_CheckExact(mapping)) {
4201 mapstring = PyUnicode_AS_UNICODE(mapping);
4202 maplen = PyUnicode_GET_SIZE(mapping);
4203 while (s < e) {
4204 unsigned char ch = *s;
4205 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004207 if (ch < maplen)
4208 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004210 if (x == 0xfffe) {
4211 /* undefined mapping */
4212 outpos = p-PyUnicode_AS_UNICODE(v);
4213 startinpos = s-starts;
4214 endinpos = startinpos+1;
4215 if (unicode_decode_call_errorhandler(
4216 errors, &errorHandler,
4217 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004218 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004219 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004220 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004221 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004222 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004223 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004224 *p++ = x;
4225 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004227 }
4228 else {
4229 while (s < e) {
4230 unsigned char ch = *s;
4231 PyObject *w, *x;
4232
4233 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004234 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004235 if (w == NULL)
4236 goto onError;
4237 x = PyObject_GetItem(mapping, w);
4238 Py_DECREF(w);
4239 if (x == NULL) {
4240 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4241 /* No mapping found means: mapping is undefined. */
4242 PyErr_Clear();
4243 x = Py_None;
4244 Py_INCREF(x);
4245 } else
4246 goto onError;
4247 }
4248
4249 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004250 if (PyLong_Check(x)) {
4251 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004252 if (value < 0 || value > 65535) {
4253 PyErr_SetString(PyExc_TypeError,
4254 "character mapping must be in range(65536)");
4255 Py_DECREF(x);
4256 goto onError;
4257 }
4258 *p++ = (Py_UNICODE)value;
4259 }
4260 else if (x == Py_None) {
4261 /* undefined mapping */
4262 outpos = p-PyUnicode_AS_UNICODE(v);
4263 startinpos = s-starts;
4264 endinpos = startinpos+1;
4265 if (unicode_decode_call_errorhandler(
4266 errors, &errorHandler,
4267 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004268 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004269 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004270 Py_DECREF(x);
4271 goto onError;
4272 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004273 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004274 continue;
4275 }
4276 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004277 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004278
4279 if (targetsize == 1)
4280 /* 1-1 mapping */
4281 *p++ = *PyUnicode_AS_UNICODE(x);
4282
4283 else if (targetsize > 1) {
4284 /* 1-n mapping */
4285 if (targetsize > extrachars) {
4286 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4288 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004289 (targetsize << 2);
4290 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004291 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004292 if (_PyUnicode_Resize(&v,
4293 PyUnicode_GET_SIZE(v) + needed) < 0) {
4294 Py_DECREF(x);
4295 goto onError;
4296 }
4297 p = PyUnicode_AS_UNICODE(v) + oldpos;
4298 }
4299 Py_UNICODE_COPY(p,
4300 PyUnicode_AS_UNICODE(x),
4301 targetsize);
4302 p += targetsize;
4303 extrachars -= targetsize;
4304 }
4305 /* 1-0 mapping: skip the character */
4306 }
4307 else {
4308 /* wrong return value */
4309 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004310 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004311 Py_DECREF(x);
4312 goto onError;
4313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004315 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 }
4318 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004319 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 Py_XDECREF(errorHandler);
4322 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 Py_XDECREF(errorHandler);
4327 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 Py_XDECREF(v);
4329 return NULL;
4330}
4331
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004332/* Charmap encoding: the lookup table */
4333
4334struct encoding_map{
4335 PyObject_HEAD
4336 unsigned char level1[32];
4337 int count2, count3;
4338 unsigned char level23[1];
4339};
4340
4341static PyObject*
4342encoding_map_size(PyObject *obj, PyObject* args)
4343{
4344 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004345 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346 128*map->count3);
4347}
4348
4349static PyMethodDef encoding_map_methods[] = {
4350 {"size", encoding_map_size, METH_NOARGS,
4351 PyDoc_STR("Return the size (in bytes) of this object") },
4352 { 0 }
4353};
4354
4355static void
4356encoding_map_dealloc(PyObject* o)
4357{
4358 PyObject_FREE(o);
4359}
4360
4361static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004362 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004363 "EncodingMap", /*tp_name*/
4364 sizeof(struct encoding_map), /*tp_basicsize*/
4365 0, /*tp_itemsize*/
4366 /* methods */
4367 encoding_map_dealloc, /*tp_dealloc*/
4368 0, /*tp_print*/
4369 0, /*tp_getattr*/
4370 0, /*tp_setattr*/
4371 0, /*tp_compare*/
4372 0, /*tp_repr*/
4373 0, /*tp_as_number*/
4374 0, /*tp_as_sequence*/
4375 0, /*tp_as_mapping*/
4376 0, /*tp_hash*/
4377 0, /*tp_call*/
4378 0, /*tp_str*/
4379 0, /*tp_getattro*/
4380 0, /*tp_setattro*/
4381 0, /*tp_as_buffer*/
4382 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4383 0, /*tp_doc*/
4384 0, /*tp_traverse*/
4385 0, /*tp_clear*/
4386 0, /*tp_richcompare*/
4387 0, /*tp_weaklistoffset*/
4388 0, /*tp_iter*/
4389 0, /*tp_iternext*/
4390 encoding_map_methods, /*tp_methods*/
4391 0, /*tp_members*/
4392 0, /*tp_getset*/
4393 0, /*tp_base*/
4394 0, /*tp_dict*/
4395 0, /*tp_descr_get*/
4396 0, /*tp_descr_set*/
4397 0, /*tp_dictoffset*/
4398 0, /*tp_init*/
4399 0, /*tp_alloc*/
4400 0, /*tp_new*/
4401 0, /*tp_free*/
4402 0, /*tp_is_gc*/
4403};
4404
4405PyObject*
4406PyUnicode_BuildEncodingMap(PyObject* string)
4407{
4408 Py_UNICODE *decode;
4409 PyObject *result;
4410 struct encoding_map *mresult;
4411 int i;
4412 int need_dict = 0;
4413 unsigned char level1[32];
4414 unsigned char level2[512];
4415 unsigned char *mlevel1, *mlevel2, *mlevel3;
4416 int count2 = 0, count3 = 0;
4417
4418 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4419 PyErr_BadArgument();
4420 return NULL;
4421 }
4422 decode = PyUnicode_AS_UNICODE(string);
4423 memset(level1, 0xFF, sizeof level1);
4424 memset(level2, 0xFF, sizeof level2);
4425
4426 /* If there isn't a one-to-one mapping of NULL to \0,
4427 or if there are non-BMP characters, we need to use
4428 a mapping dictionary. */
4429 if (decode[0] != 0)
4430 need_dict = 1;
4431 for (i = 1; i < 256; i++) {
4432 int l1, l2;
4433 if (decode[i] == 0
4434 #ifdef Py_UNICODE_WIDE
4435 || decode[i] > 0xFFFF
4436 #endif
4437 ) {
4438 need_dict = 1;
4439 break;
4440 }
4441 if (decode[i] == 0xFFFE)
4442 /* unmapped character */
4443 continue;
4444 l1 = decode[i] >> 11;
4445 l2 = decode[i] >> 7;
4446 if (level1[l1] == 0xFF)
4447 level1[l1] = count2++;
4448 if (level2[l2] == 0xFF)
4449 level2[l2] = count3++;
4450 }
4451
4452 if (count2 >= 0xFF || count3 >= 0xFF)
4453 need_dict = 1;
4454
4455 if (need_dict) {
4456 PyObject *result = PyDict_New();
4457 PyObject *key, *value;
4458 if (!result)
4459 return NULL;
4460 for (i = 0; i < 256; i++) {
4461 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004462 key = PyLong_FromLong(decode[i]);
4463 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004464 if (!key || !value)
4465 goto failed1;
4466 if (PyDict_SetItem(result, key, value) == -1)
4467 goto failed1;
4468 Py_DECREF(key);
4469 Py_DECREF(value);
4470 }
4471 return result;
4472 failed1:
4473 Py_XDECREF(key);
4474 Py_XDECREF(value);
4475 Py_DECREF(result);
4476 return NULL;
4477 }
4478
4479 /* Create a three-level trie */
4480 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4481 16*count2 + 128*count3 - 1);
4482 if (!result)
4483 return PyErr_NoMemory();
4484 PyObject_Init(result, &EncodingMapType);
4485 mresult = (struct encoding_map*)result;
4486 mresult->count2 = count2;
4487 mresult->count3 = count3;
4488 mlevel1 = mresult->level1;
4489 mlevel2 = mresult->level23;
4490 mlevel3 = mresult->level23 + 16*count2;
4491 memcpy(mlevel1, level1, 32);
4492 memset(mlevel2, 0xFF, 16*count2);
4493 memset(mlevel3, 0, 128*count3);
4494 count3 = 0;
4495 for (i = 1; i < 256; i++) {
4496 int o1, o2, o3, i2, i3;
4497 if (decode[i] == 0xFFFE)
4498 /* unmapped character */
4499 continue;
4500 o1 = decode[i]>>11;
4501 o2 = (decode[i]>>7) & 0xF;
4502 i2 = 16*mlevel1[o1] + o2;
4503 if (mlevel2[i2] == 0xFF)
4504 mlevel2[i2] = count3++;
4505 o3 = decode[i] & 0x7F;
4506 i3 = 128*mlevel2[i2] + o3;
4507 mlevel3[i3] = i;
4508 }
4509 return result;
4510}
4511
4512static int
4513encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4514{
4515 struct encoding_map *map = (struct encoding_map*)mapping;
4516 int l1 = c>>11;
4517 int l2 = (c>>7) & 0xF;
4518 int l3 = c & 0x7F;
4519 int i;
4520
4521#ifdef Py_UNICODE_WIDE
4522 if (c > 0xFFFF) {
4523 return -1;
4524 }
4525#endif
4526 if (c == 0)
4527 return 0;
4528 /* level 1*/
4529 i = map->level1[l1];
4530 if (i == 0xFF) {
4531 return -1;
4532 }
4533 /* level 2*/
4534 i = map->level23[16*i+l2];
4535 if (i == 0xFF) {
4536 return -1;
4537 }
4538 /* level 3 */
4539 i = map->level23[16*map->count2 + 128*i + l3];
4540 if (i == 0) {
4541 return -1;
4542 }
4543 return i;
4544}
4545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546/* Lookup the character ch in the mapping. If the character
4547 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004548 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550{
Christian Heimes217cfd12007-12-02 14:31:20 +00004551 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 PyObject *x;
4553
4554 if (w == NULL)
4555 return NULL;
4556 x = PyObject_GetItem(mapping, w);
4557 Py_DECREF(w);
4558 if (x == NULL) {
4559 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4560 /* No mapping found means: mapping is undefined. */
4561 PyErr_Clear();
4562 x = Py_None;
4563 Py_INCREF(x);
4564 return x;
4565 } else
4566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004568 else if (x == Py_None)
4569 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004570 else if (PyLong_Check(x)) {
4571 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (value < 0 || value > 255) {
4573 PyErr_SetString(PyExc_TypeError,
4574 "character mapping must be in range(256)");
4575 Py_DECREF(x);
4576 return NULL;
4577 }
4578 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004580 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004584 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004585 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004586 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_DECREF(x);
4588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 }
4590}
4591
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004592static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004593charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004594{
Christian Heimes72b710a2008-05-26 13:28:38 +00004595 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004596 /* exponentially overallocate to minimize reallocations */
4597 if (requiredsize < 2*outsize)
4598 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004599 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004600 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004601 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004602}
4603
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004608 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 space is available. Return a new reference to the object that
4610 was put in the output buffer, or Py_None, if the mapping was undefined
4611 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004612 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004614charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004615 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004617 PyObject *rep;
4618 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620
Christian Heimes90aa7642007-12-19 02:45:37 +00004621 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004622 int res = encoding_map_lookup(c, mapping);
4623 Py_ssize_t requiredsize = *outpos+1;
4624 if (res == -1)
4625 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004626 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004627 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004628 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004629 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004630 outstart[(*outpos)++] = (char)res;
4631 return enc_SUCCESS;
4632 }
4633
4634 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004636 return enc_EXCEPTION;
4637 else if (rep==Py_None) {
4638 Py_DECREF(rep);
4639 return enc_FAILED;
4640 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004641 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004643 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004644 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004646 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004648 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004649 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 }
4651 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004652 const char *repchars = PyBytes_AS_STRING(rep);
4653 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004654 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004655 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004656 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004658 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004660 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 memcpy(outstart + *outpos, repchars, repsize);
4662 *outpos += repsize;
4663 }
4664 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004665 Py_DECREF(rep);
4666 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667}
4668
4669/* handle an error in PyUnicode_EncodeCharmap
4670 Return 0 on success, -1 on error */
4671static
4672int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004675 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004676 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677{
4678 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t repsize;
4680 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 Py_UNICODE *uni2;
4682 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683 Py_ssize_t collstartpos = *inpos;
4684 Py_ssize_t collendpos = *inpos+1;
4685 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 char *encoding = "charmap";
4687 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004688 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 /* find all unencodable characters */
4691 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004692 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004693 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004694 int res = encoding_map_lookup(p[collendpos], mapping);
4695 if (res != -1)
4696 break;
4697 ++collendpos;
4698 continue;
4699 }
4700
4701 rep = charmapencode_lookup(p[collendpos], mapping);
4702 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004704 else if (rep!=Py_None) {
4705 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004706 break;
4707 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004708 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 ++collendpos;
4710 }
4711 /* cache callback name lookup
4712 * (if not done yet, i.e. it's the first error) */
4713 if (*known_errorHandler==-1) {
4714 if ((errors==NULL) || (!strcmp(errors, "strict")))
4715 *known_errorHandler = 1;
4716 else if (!strcmp(errors, "replace"))
4717 *known_errorHandler = 2;
4718 else if (!strcmp(errors, "ignore"))
4719 *known_errorHandler = 3;
4720 else if (!strcmp(errors, "xmlcharrefreplace"))
4721 *known_errorHandler = 4;
4722 else
4723 *known_errorHandler = 0;
4724 }
4725 switch (*known_errorHandler) {
4726 case 1: /* strict */
4727 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4728 return -1;
4729 case 2: /* replace */
4730 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4731 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004732 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 return -1;
4734 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004735 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 }
4740 /* fall through */
4741 case 3: /* ignore */
4742 *inpos = collendpos;
4743 break;
4744 case 4: /* xmlcharrefreplace */
4745 /* generate replacement (temporarily (mis)uses p) */
4746 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4747 char buffer[2+29+1+1];
4748 char *cp;
4749 sprintf(buffer, "&#%d;", (int)p[collpos]);
4750 for (cp = buffer; *cp; ++cp) {
4751 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004752 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004754 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4756 return -1;
4757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 }
4759 }
4760 *inpos = collendpos;
4761 break;
4762 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004763 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 encoding, reason, p, size, exceptionObject,
4765 collstartpos, collendpos, &newpos);
4766 if (repunicode == NULL)
4767 return -1;
4768 /* generate replacement */
4769 repsize = PyUnicode_GET_SIZE(repunicode);
4770 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4771 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004772 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 return -1;
4774 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004775 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4778 return -1;
4779 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 }
4781 *inpos = newpos;
4782 Py_DECREF(repunicode);
4783 }
4784 return 0;
4785}
4786
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 PyObject *mapping,
4790 const char *errors)
4791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 /* output object */
4793 PyObject *res = NULL;
4794 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004795 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 PyObject *errorHandler = NULL;
4799 PyObject *exc = NULL;
4800 /* the following variable is used for caching string comparisons
4801 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4802 * 3=ignore, 4=xmlcharrefreplace */
4803 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804
4805 /* Default to Latin-1 */
4806 if (mapping == NULL)
4807 return PyUnicode_EncodeLatin1(p, size, errors);
4808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 /* allocate enough for a simple encoding without
4810 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004811 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (res == NULL)
4813 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004814 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 while (inpos<size) {
4818 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004819 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004820 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004822 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 if (charmap_encoding_error(p, size, &inpos, mapping,
4824 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004825 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004826 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004827 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 else
4831 /* done with this character => adjust input position */
4832 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004836 if (respos<PyBytes_GET_SIZE(res))
4837 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 Py_XDECREF(exc);
4840 Py_XDECREF(errorHandler);
4841 return res;
4842
4843 onError:
4844 Py_XDECREF(res);
4845 Py_XDECREF(exc);
4846 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return NULL;
4848}
4849
4850PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4851 PyObject *mapping)
4852{
4853 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4854 PyErr_BadArgument();
4855 return NULL;
4856 }
4857 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4858 PyUnicode_GET_SIZE(unicode),
4859 mapping,
4860 NULL);
4861}
4862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863/* create or adjust a UnicodeTranslateError */
4864static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 const Py_UNICODE *unicode, Py_ssize_t size,
4866 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 if (*exceptionObject == NULL) {
4870 *exceptionObject = PyUnicodeTranslateError_Create(
4871 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 }
4873 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4875 goto onError;
4876 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4877 goto onError;
4878 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4879 goto onError;
4880 return;
4881 onError:
4882 Py_DECREF(*exceptionObject);
4883 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 }
4885}
4886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887/* raises a UnicodeTranslateError */
4888static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004889 const Py_UNICODE *unicode, Py_ssize_t size,
4890 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 const char *reason)
4892{
4893 make_translate_exception(exceptionObject,
4894 unicode, size, startpos, endpos, reason);
4895 if (*exceptionObject != NULL)
4896 PyCodec_StrictErrors(*exceptionObject);
4897}
4898
4899/* error handling callback helper:
4900 build arguments, call the callback and check the arguments,
4901 put the result into newpos and return the replacement string, which
4902 has to be freed by the caller */
4903static PyObject *unicode_translate_call_errorhandler(const char *errors,
4904 PyObject **errorHandler,
4905 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004906 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4907 Py_ssize_t startpos, Py_ssize_t endpos,
4908 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004910 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004912 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 PyObject *restuple;
4914 PyObject *resunicode;
4915
4916 if (*errorHandler == NULL) {
4917 *errorHandler = PyCodec_LookupError(errors);
4918 if (*errorHandler == NULL)
4919 return NULL;
4920 }
4921
4922 make_translate_exception(exceptionObject,
4923 unicode, size, startpos, endpos, reason);
4924 if (*exceptionObject == NULL)
4925 return NULL;
4926
4927 restuple = PyObject_CallFunctionObjArgs(
4928 *errorHandler, *exceptionObject, NULL);
4929 if (restuple == NULL)
4930 return NULL;
4931 if (!PyTuple_Check(restuple)) {
4932 PyErr_Format(PyExc_TypeError, &argparse[4]);
4933 Py_DECREF(restuple);
4934 return NULL;
4935 }
4936 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004937 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 Py_DECREF(restuple);
4939 return NULL;
4940 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 if (i_newpos<0)
4942 *newpos = size+i_newpos;
4943 else
4944 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004945 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004946 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004947 Py_DECREF(restuple);
4948 return NULL;
4949 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 Py_INCREF(resunicode);
4951 Py_DECREF(restuple);
4952 return resunicode;
4953}
4954
4955/* Lookup the character ch in the mapping and put the result in result,
4956 which must be decrefed by the caller.
4957 Return 0 on success, -1 on error */
4958static
4959int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4960{
Christian Heimes217cfd12007-12-02 14:31:20 +00004961 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 PyObject *x;
4963
4964 if (w == NULL)
4965 return -1;
4966 x = PyObject_GetItem(mapping, w);
4967 Py_DECREF(w);
4968 if (x == NULL) {
4969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4970 /* No mapping found means: use 1:1 mapping. */
4971 PyErr_Clear();
4972 *result = NULL;
4973 return 0;
4974 } else
4975 return -1;
4976 }
4977 else if (x == Py_None) {
4978 *result = x;
4979 return 0;
4980 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004981 else if (PyLong_Check(x)) {
4982 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 long max = PyUnicode_GetMax();
4984 if (value < 0 || value > max) {
4985 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004986 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 Py_DECREF(x);
4988 return -1;
4989 }
4990 *result = x;
4991 return 0;
4992 }
4993 else if (PyUnicode_Check(x)) {
4994 *result = x;
4995 return 0;
4996 }
4997 else {
4998 /* wrong return value */
4999 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005000 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005001 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 return -1;
5003 }
5004}
5005/* ensure that *outobj is at least requiredsize characters long,
5006if not reallocate and adjust various state variables.
5007Return 0 on success, -1 on error */
5008static
Walter Dörwald4894c302003-10-24 14:25:28 +00005009int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005010 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005012 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005013 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005017 if (requiredsize < 2 * oldsize)
5018 requiredsize = 2 * oldsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005019 if (PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 return -1;
5021 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 }
5023 return 0;
5024}
5025/* lookup the character, put the result in the output string and adjust
5026 various state variables. Return a new reference to the object that
5027 was put in the output buffer in *result, or Py_None, if the mapping was
5028 undefined (in which case no character was written).
5029 The called must decref result.
5030 Return 0 on success, -1 on error. */
5031static
Walter Dörwald4894c302003-10-24 14:25:28 +00005032int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005033 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005034 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035{
Walter Dörwald4894c302003-10-24 14:25:28 +00005036 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 return -1;
5038 if (*res==NULL) {
5039 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005040 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 }
5042 else if (*res==Py_None)
5043 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005044 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005046 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 }
5048 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005049 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 if (repsize==1) {
5051 /* no overflow check, because we know that the space is enough */
5052 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5053 }
5054 else if (repsize!=0) {
5055 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005057 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005058 repsize - 1;
5059 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 return -1;
5061 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5062 *outp += repsize;
5063 }
5064 }
5065 else
5066 return -1;
5067 return 0;
5068}
5069
5070PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005071 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 PyObject *mapping,
5073 const char *errors)
5074{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075 /* output object */
5076 PyObject *res = NULL;
5077 /* pointers to the beginning and end+1 of input */
5078 const Py_UNICODE *startp = p;
5079 const Py_UNICODE *endp = p + size;
5080 /* pointer into the output */
5081 Py_UNICODE *str;
5082 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005083 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 char *reason = "character maps to <undefined>";
5085 PyObject *errorHandler = NULL;
5086 PyObject *exc = NULL;
5087 /* the following variable is used for caching string comparisons
5088 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5089 * 3=ignore, 4=xmlcharrefreplace */
5090 int known_errorHandler = -1;
5091
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 if (mapping == NULL) {
5093 PyErr_BadArgument();
5094 return NULL;
5095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096
5097 /* allocate enough for a simple 1:1 translation without
5098 replacements, if we need more, we'll resize */
5099 res = PyUnicode_FromUnicode(NULL, size);
5100 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 return res;
5104 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 while (p<endp) {
5107 /* try to encode it */
5108 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005109 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 goto onError;
5112 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005113 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 if (x!=Py_None) /* it worked => adjust input pointer */
5115 ++p;
5116 else { /* untranslatable character */
5117 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005118 Py_ssize_t repsize;
5119 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005120 Py_UNICODE *uni2;
5121 /* startpos for collecting untranslatable chars */
5122 const Py_UNICODE *collstart = p;
5123 const Py_UNICODE *collend = p+1;
5124 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 /* find all untranslatable characters */
5127 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005128 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 goto onError;
5130 Py_XDECREF(x);
5131 if (x!=Py_None)
5132 break;
5133 ++collend;
5134 }
5135 /* cache callback name lookup
5136 * (if not done yet, i.e. it's the first error) */
5137 if (known_errorHandler==-1) {
5138 if ((errors==NULL) || (!strcmp(errors, "strict")))
5139 known_errorHandler = 1;
5140 else if (!strcmp(errors, "replace"))
5141 known_errorHandler = 2;
5142 else if (!strcmp(errors, "ignore"))
5143 known_errorHandler = 3;
5144 else if (!strcmp(errors, "xmlcharrefreplace"))
5145 known_errorHandler = 4;
5146 else
5147 known_errorHandler = 0;
5148 }
5149 switch (known_errorHandler) {
5150 case 1: /* strict */
5151 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5152 goto onError;
5153 case 2: /* replace */
5154 /* No need to check for space, this is a 1:1 replacement */
5155 for (coll = collstart; coll<collend; ++coll)
5156 *str++ = '?';
5157 /* fall through */
5158 case 3: /* ignore */
5159 p = collend;
5160 break;
5161 case 4: /* xmlcharrefreplace */
5162 /* generate replacement (temporarily (mis)uses p) */
5163 for (p = collstart; p < collend; ++p) {
5164 char buffer[2+29+1+1];
5165 char *cp;
5166 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005167 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5169 goto onError;
5170 for (cp = buffer; *cp; ++cp)
5171 *str++ = *cp;
5172 }
5173 p = collend;
5174 break;
5175 default:
5176 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5177 reason, startp, size, &exc,
5178 collstart-startp, collend-startp, &newpos);
5179 if (repunicode == NULL)
5180 goto onError;
5181 /* generate replacement */
5182 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005183 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5185 Py_DECREF(repunicode);
5186 goto onError;
5187 }
5188 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5189 *str++ = *uni2;
5190 p = startp + newpos;
5191 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 }
5193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 /* Resize if we allocated to much */
5196 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005197 if (respos<PyUnicode_GET_SIZE(res)) {
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005198 if (PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005199 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 }
5201 Py_XDECREF(exc);
5202 Py_XDECREF(errorHandler);
5203 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 onError:
5206 Py_XDECREF(res);
5207 Py_XDECREF(exc);
5208 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 return NULL;
5210}
5211
5212PyObject *PyUnicode_Translate(PyObject *str,
5213 PyObject *mapping,
5214 const char *errors)
5215{
5216 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 str = PyUnicode_FromObject(str);
5219 if (str == NULL)
5220 goto onError;
5221 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5222 PyUnicode_GET_SIZE(str),
5223 mapping,
5224 errors);
5225 Py_DECREF(str);
5226 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005227
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 onError:
5229 Py_XDECREF(str);
5230 return NULL;
5231}
Tim Petersced69f82003-09-16 20:30:58 +00005232
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233/* --- Decimal Encoder ---------------------------------------------------- */
5234
5235int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005237 char *output,
5238 const char *errors)
5239{
5240 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 PyObject *errorHandler = NULL;
5242 PyObject *exc = NULL;
5243 const char *encoding = "decimal";
5244 const char *reason = "invalid decimal Unicode string";
5245 /* the following variable is used for caching string comparisons
5246 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5247 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005248
5249 if (output == NULL) {
5250 PyErr_BadArgument();
5251 return -1;
5252 }
5253
5254 p = s;
5255 end = s + length;
5256 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005257 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005258 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005260 Py_ssize_t repsize;
5261 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262 Py_UNICODE *uni2;
5263 Py_UNICODE *collstart;
5264 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005265
Guido van Rossum9e896b32000-04-05 20:11:21 +00005266 if (Py_UNICODE_ISSPACE(ch)) {
5267 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005269 continue;
5270 }
5271 decimal = Py_UNICODE_TODECIMAL(ch);
5272 if (decimal >= 0) {
5273 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005275 continue;
5276 }
Guido van Rossumba477042000-04-06 18:18:10 +00005277 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005278 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005280 continue;
5281 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 /* All other characters are considered unencodable */
5283 collstart = p;
5284 collend = p+1;
5285 while (collend < end) {
5286 if ((0 < *collend && *collend < 256) ||
5287 !Py_UNICODE_ISSPACE(*collend) ||
5288 Py_UNICODE_TODECIMAL(*collend))
5289 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005290 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005291 /* cache callback name lookup
5292 * (if not done yet, i.e. it's the first error) */
5293 if (known_errorHandler==-1) {
5294 if ((errors==NULL) || (!strcmp(errors, "strict")))
5295 known_errorHandler = 1;
5296 else if (!strcmp(errors, "replace"))
5297 known_errorHandler = 2;
5298 else if (!strcmp(errors, "ignore"))
5299 known_errorHandler = 3;
5300 else if (!strcmp(errors, "xmlcharrefreplace"))
5301 known_errorHandler = 4;
5302 else
5303 known_errorHandler = 0;
5304 }
5305 switch (known_errorHandler) {
5306 case 1: /* strict */
5307 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5308 goto onError;
5309 case 2: /* replace */
5310 for (p = collstart; p < collend; ++p)
5311 *output++ = '?';
5312 /* fall through */
5313 case 3: /* ignore */
5314 p = collend;
5315 break;
5316 case 4: /* xmlcharrefreplace */
5317 /* generate replacement (temporarily (mis)uses p) */
5318 for (p = collstart; p < collend; ++p)
5319 output += sprintf(output, "&#%d;", (int)*p);
5320 p = collend;
5321 break;
5322 default:
5323 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5324 encoding, reason, s, length, &exc,
5325 collstart-s, collend-s, &newpos);
5326 if (repunicode == NULL)
5327 goto onError;
5328 /* generate replacement */
5329 repsize = PyUnicode_GET_SIZE(repunicode);
5330 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5331 Py_UNICODE ch = *uni2;
5332 if (Py_UNICODE_ISSPACE(ch))
5333 *output++ = ' ';
5334 else {
5335 decimal = Py_UNICODE_TODECIMAL(ch);
5336 if (decimal >= 0)
5337 *output++ = '0' + decimal;
5338 else if (0 < ch && ch < 256)
5339 *output++ = (char)ch;
5340 else {
5341 Py_DECREF(repunicode);
5342 raise_encode_exception(&exc, encoding,
5343 s, length, collstart-s, collend-s, reason);
5344 goto onError;
5345 }
5346 }
5347 }
5348 p = s + newpos;
5349 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005350 }
5351 }
5352 /* 0-terminate the output string */
5353 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354 Py_XDECREF(exc);
5355 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005356 return 0;
5357
5358 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 Py_XDECREF(exc);
5360 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005361 return -1;
5362}
5363
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364/* --- Helpers ------------------------------------------------------------ */
5365
Eric Smith8c663262007-08-25 02:26:07 +00005366#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005367#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005368#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005369/* Include _ParseTupleFinds from find.h */
5370#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005371#include "stringlib/find.h"
5372#include "stringlib/partition.h"
5373
Eric Smith5807c412008-05-11 21:00:57 +00005374#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5375#include "stringlib/localeutil.h"
5376
/* Helper macro to fix up start/end slice values.

   Operates on variables named `start` and `end` in the enclosing scope.
   A negative index has (obj)->length added to it and is then floored at
   0; `end` is additionally capped at (obj)->length before that
   adjustment.  `start` is not capped at the length.  Expands to several
   statements, so it must only be used where a statement list is valid. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5389
Martin v. Löwis18e16552006-02-15 17:27:45 +00005390Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005391 PyObject *substr,
5392 Py_ssize_t start,
5393 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005395 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005396 PyUnicodeObject* str_obj;
5397 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005398
Thomas Wouters477c8d52006-05-27 19:21:47 +00005399 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5400 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005402 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5403 if (!sub_obj) {
5404 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 return -1;
5406 }
Tim Petersced69f82003-09-16 20:30:58 +00005407
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005409
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 result = stringlib_count(
5411 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5412 );
5413
5414 Py_DECREF(sub_obj);
5415 Py_DECREF(str_obj);
5416
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 return result;
5418}
5419
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005421 PyObject *sub,
5422 Py_ssize_t start,
5423 Py_ssize_t end,
5424 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005426 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005427
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005429 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005430 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005431 sub = PyUnicode_FromObject(sub);
5432 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005433 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005434 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
Tim Petersced69f82003-09-16 20:30:58 +00005436
Thomas Wouters477c8d52006-05-27 19:21:47 +00005437 if (direction > 0)
5438 result = stringlib_find_slice(
5439 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5440 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5441 start, end
5442 );
5443 else
5444 result = stringlib_rfind_slice(
5445 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5446 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5447 start, end
5448 );
5449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005451 Py_DECREF(sub);
5452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 return result;
5454}
5455
Tim Petersced69f82003-09-16 20:30:58 +00005456static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457int tailmatch(PyUnicodeObject *self,
5458 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459 Py_ssize_t start,
5460 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 int direction)
5462{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 if (substring->length == 0)
5464 return 1;
5465
Thomas Wouters477c8d52006-05-27 19:21:47 +00005466 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
5468 end -= substring->length;
5469 if (end < start)
5470 return 0;
5471
5472 if (direction > 0) {
5473 if (Py_UNICODE_MATCH(self, end, substring))
5474 return 1;
5475 } else {
5476 if (Py_UNICODE_MATCH(self, start, substring))
5477 return 1;
5478 }
5479
5480 return 0;
5481}
5482
Martin v. Löwis18e16552006-02-15 17:27:45 +00005483Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005485 Py_ssize_t start,
5486 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 int direction)
5488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 str = PyUnicode_FromObject(str);
5492 if (str == NULL)
5493 return -1;
5494 substr = PyUnicode_FromObject(substr);
5495 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005496 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 return -1;
5498 }
Tim Petersced69f82003-09-16 20:30:58 +00005499
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 result = tailmatch((PyUnicodeObject *)str,
5501 (PyUnicodeObject *)substr,
5502 start, end, direction);
5503 Py_DECREF(str);
5504 Py_DECREF(substr);
5505 return result;
5506}
5507
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508/* Apply fixfct filter to the Unicode object self and return a
5509 reference to the modified object */
5510
Tim Petersced69f82003-09-16 20:30:58 +00005511static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512PyObject *fixup(PyUnicodeObject *self,
5513 int (*fixfct)(PyUnicodeObject *s))
5514{
5515
5516 PyUnicodeObject *u;
5517
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005518 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (u == NULL)
5520 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005521
5522 Py_UNICODE_COPY(u->str, self->str, self->length);
5523
Tim Peters7a29bd52001-09-12 03:03:31 +00005524 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 /* fixfct should return TRUE if it modified the buffer. If
5526 FALSE, return a reference to the original buffer instead
5527 (to save space, not time) */
5528 Py_INCREF(self);
5529 Py_DECREF(u);
5530 return (PyObject*) self;
5531 }
5532 return (PyObject*) u;
5533}
5534
Tim Petersced69f82003-09-16 20:30:58 +00005535static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536int fixupper(PyUnicodeObject *self)
5537{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005538 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 Py_UNICODE *s = self->str;
5540 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005541
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 while (len-- > 0) {
5543 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 ch = Py_UNICODE_TOUPPER(*s);
5546 if (ch != *s) {
5547 status = 1;
5548 *s = ch;
5549 }
5550 s++;
5551 }
5552
5553 return status;
5554}
5555
Tim Petersced69f82003-09-16 20:30:58 +00005556static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557int fixlower(PyUnicodeObject *self)
5558{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005559 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 Py_UNICODE *s = self->str;
5561 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 while (len-- > 0) {
5564 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005565
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 ch = Py_UNICODE_TOLOWER(*s);
5567 if (ch != *s) {
5568 status = 1;
5569 *s = ch;
5570 }
5571 s++;
5572 }
5573
5574 return status;
5575}
5576
Tim Petersced69f82003-09-16 20:30:58 +00005577static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578int fixswapcase(PyUnicodeObject *self)
5579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005580 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 Py_UNICODE *s = self->str;
5582 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 while (len-- > 0) {
5585 if (Py_UNICODE_ISUPPER(*s)) {
5586 *s = Py_UNICODE_TOLOWER(*s);
5587 status = 1;
5588 } else if (Py_UNICODE_ISLOWER(*s)) {
5589 *s = Py_UNICODE_TOUPPER(*s);
5590 status = 1;
5591 }
5592 s++;
5593 }
5594
5595 return status;
5596}
5597
Tim Petersced69f82003-09-16 20:30:58 +00005598static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599int fixcapitalize(PyUnicodeObject *self)
5600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005602 Py_UNICODE *s = self->str;
5603 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005604
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005605 if (len == 0)
5606 return 0;
5607 if (Py_UNICODE_ISLOWER(*s)) {
5608 *s = Py_UNICODE_TOUPPER(*s);
5609 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005611 s++;
5612 while (--len > 0) {
5613 if (Py_UNICODE_ISUPPER(*s)) {
5614 *s = Py_UNICODE_TOLOWER(*s);
5615 status = 1;
5616 }
5617 s++;
5618 }
5619 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
5622static
5623int fixtitle(PyUnicodeObject *self)
5624{
5625 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5626 register Py_UNICODE *e;
5627 int previous_is_cased;
5628
5629 /* Shortcut for single character strings */
5630 if (PyUnicode_GET_SIZE(self) == 1) {
5631 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5632 if (*p != ch) {
5633 *p = ch;
5634 return 1;
5635 }
5636 else
5637 return 0;
5638 }
Tim Petersced69f82003-09-16 20:30:58 +00005639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 e = p + PyUnicode_GET_SIZE(self);
5641 previous_is_cased = 0;
5642 for (; p < e; p++) {
5643 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005644
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 if (previous_is_cased)
5646 *p = Py_UNICODE_TOLOWER(ch);
5647 else
5648 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005649
5650 if (Py_UNICODE_ISLOWER(ch) ||
5651 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 Py_UNICODE_ISTITLE(ch))
5653 previous_is_cased = 1;
5654 else
5655 previous_is_cased = 0;
5656 }
5657 return 1;
5658}
5659
Tim Peters8ce9f162004-08-27 01:49:32 +00005660PyObject *
5661PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662{
Skip Montanaro6543b452004-09-16 03:28:13 +00005663 const Py_UNICODE blank = ' ';
5664 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005665 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005667 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5668 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005669 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5670 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005671 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005672 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 fseq = PySequence_Fast(seq, "");
5675 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005677 }
5678
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005679 /* NOTE: the following code can't call back into Python code,
5680 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005681 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005682
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 seqlen = PySequence_Fast_GET_SIZE(fseq);
5684 /* If empty sequence, return u"". */
5685 if (seqlen == 0) {
5686 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5687 goto Done;
5688 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005689 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005690 /* If singleton sequence with an exact Unicode, return that. */
5691 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005692 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005693 if (PyUnicode_CheckExact(item)) {
5694 Py_INCREF(item);
5695 res = (PyUnicodeObject *)item;
5696 goto Done;
5697 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005698 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005699 else {
5700 /* Set up sep and seplen */
5701 if (separator == NULL) {
5702 sep = &blank;
5703 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005704 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005705 else {
5706 if (!PyUnicode_Check(separator)) {
5707 PyErr_Format(PyExc_TypeError,
5708 "separator: expected str instance,"
5709 " %.80s found",
5710 Py_TYPE(separator)->tp_name);
5711 goto onError;
5712 }
5713 sep = PyUnicode_AS_UNICODE(separator);
5714 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005715 }
5716 }
5717
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005718 /* There are at least two things to join, or else we have a subclass
5719 * of str in the sequence.
5720 * Do a pre-pass to figure out the total amount of space we'll
5721 * need (sz), and see whether all argument are strings.
5722 */
5723 sz = 0;
5724 for (i = 0; i < seqlen; i++) {
5725 const Py_ssize_t old_sz = sz;
5726 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005727 if (!PyUnicode_Check(item)) {
5728 PyErr_Format(PyExc_TypeError,
5729 "sequence item %zd: expected str instance,"
5730 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005731 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005732 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005733 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005734 sz += PyUnicode_GET_SIZE(item);
5735 if (i != 0)
5736 sz += seplen;
5737 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5738 PyErr_SetString(PyExc_OverflowError,
5739 "join() result is too long for a Python string");
5740 goto onError;
5741 }
5742 }
Tim Petersced69f82003-09-16 20:30:58 +00005743
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005744 res = _PyUnicode_New(sz);
5745 if (res == NULL)
5746 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005747
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005748 /* Catenate everything. */
5749 res_p = PyUnicode_AS_UNICODE(res);
5750 for (i = 0; i < seqlen; ++i) {
5751 Py_ssize_t itemlen;
5752 item = items[i];
5753 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005754 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005755 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005756 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005757 res_p += seplen;
5758 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005759 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5760 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005761 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005762
Tim Peters8ce9f162004-08-27 01:49:32 +00005763 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005764 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 return (PyObject *)res;
5766
5767 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005768 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005769 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 return NULL;
5771}
5772
Tim Petersced69f82003-09-16 20:30:58 +00005773static
5774PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t left,
5776 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 Py_UNICODE fill)
5778{
5779 PyUnicodeObject *u;
5780
5781 if (left < 0)
5782 left = 0;
5783 if (right < 0)
5784 right = 0;
5785
Tim Peters7a29bd52001-09-12 03:03:31 +00005786 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 Py_INCREF(self);
5788 return self;
5789 }
5790
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005791 if (left > PY_SSIZE_T_MAX - self->length ||
5792 right > PY_SSIZE_T_MAX - (left + self->length)) {
5793 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5794 return NULL;
5795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 u = _PyUnicode_New(left + self->length + right);
5797 if (u) {
5798 if (left)
5799 Py_UNICODE_FILL(u->str, fill, left);
5800 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5801 if (right)
5802 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5803 }
5804
5805 return u;
5806}
5807
/* Append data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   onError if the object cannot be created or appended.  The temporary
   reference held in `str` is always released.  Expands to several
   statements, so it must only be used where a statement list is valid. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5818
5819static
5820PyObject *split_whitespace(PyUnicodeObject *self,
5821 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824 register Py_ssize_t i;
5825 register Py_ssize_t j;
5826 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005828 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
5830 for (i = j = 0; i < len; ) {
5831 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005832 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 i++;
5834 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005835 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 i++;
5837 if (j < i) {
5838 if (maxcount-- <= 0)
5839 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005840 SPLIT_APPEND(buf, j, i);
5841 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 i++;
5843 j = i;
5844 }
5845 }
5846 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005847 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
5849 return list;
5850
5851 onError:
5852 Py_DECREF(list);
5853 return NULL;
5854}
5855
5856PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005857 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 register Py_ssize_t i;
5860 register Py_ssize_t j;
5861 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 PyObject *list;
5863 PyObject *str;
5864 Py_UNICODE *data;
5865
5866 string = PyUnicode_FromObject(string);
5867 if (string == NULL)
5868 return NULL;
5869 data = PyUnicode_AS_UNICODE(string);
5870 len = PyUnicode_GET_SIZE(string);
5871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 list = PyList_New(0);
5873 if (!list)
5874 goto onError;
5875
5876 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005877 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005878
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882
5883 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005884 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 if (i < len) {
5886 if (data[i] == '\r' && i + 1 < len &&
5887 data[i+1] == '\n')
5888 i += 2;
5889 else
5890 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005891 if (keepends)
5892 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 }
Guido van Rossum86662912000-04-11 15:38:46 +00005894 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 j = i;
5896 }
5897 if (j < len) {
5898 SPLIT_APPEND(data, j, len);
5899 }
5900
5901 Py_DECREF(string);
5902 return list;
5903
5904 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005905 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 Py_DECREF(string);
5907 return NULL;
5908}
5909
Tim Petersced69f82003-09-16 20:30:58 +00005910static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911PyObject *split_char(PyUnicodeObject *self,
5912 PyObject *list,
5913 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005914 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005916 register Py_ssize_t i;
5917 register Py_ssize_t j;
5918 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005920 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
5922 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005923 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (maxcount-- <= 0)
5925 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005926 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 i = j = i + 1;
5928 } else
5929 i++;
5930 }
5931 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005932 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
5934 return list;
5935
5936 onError:
5937 Py_DECREF(list);
5938 return NULL;
5939}
5940
Tim Petersced69f82003-09-16 20:30:58 +00005941static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942PyObject *split_substring(PyUnicodeObject *self,
5943 PyObject *list,
5944 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005945 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005947 register Py_ssize_t i;
5948 register Py_ssize_t j;
5949 Py_ssize_t len = self->length;
5950 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 PyObject *str;
5952
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005953 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 if (Py_UNICODE_MATCH(self, i, substring)) {
5955 if (maxcount-- <= 0)
5956 break;
5957 SPLIT_APPEND(self->str, j, i);
5958 i = j = i + sublen;
5959 } else
5960 i++;
5961 }
5962 if (j <= len) {
5963 SPLIT_APPEND(self->str, j, len);
5964 }
5965 return list;
5966
5967 onError:
5968 Py_DECREF(list);
5969 return NULL;
5970}
5971
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005972static
5973PyObject *rsplit_whitespace(PyUnicodeObject *self,
5974 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005975 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005977 register Py_ssize_t i;
5978 register Py_ssize_t j;
5979 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005980 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005981 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005982
5983 for (i = j = len - 1; i >= 0; ) {
5984 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005985 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005986 i--;
5987 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005988 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005989 i--;
5990 if (j > i) {
5991 if (maxcount-- <= 0)
5992 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005993 SPLIT_APPEND(buf, i + 1, j + 1);
5994 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005995 i--;
5996 j = i;
5997 }
5998 }
5999 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006000 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006001 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 if (PyList_Reverse(list) < 0)
6003 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004 return list;
6005
6006 onError:
6007 Py_DECREF(list);
6008 return NULL;
6009}
6010
6011static
6012PyObject *rsplit_char(PyUnicodeObject *self,
6013 PyObject *list,
6014 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006017 register Py_ssize_t i;
6018 register Py_ssize_t j;
6019 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006020 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006021 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006022
6023 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006024 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006025 if (maxcount-- <= 0)
6026 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006027 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006028 j = i = i - 1;
6029 } else
6030 i--;
6031 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006032 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006033 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006034 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006035 if (PyList_Reverse(list) < 0)
6036 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006037 return list;
6038
6039 onError:
6040 Py_DECREF(list);
6041 return NULL;
6042}
6043
6044static
6045PyObject *rsplit_substring(PyUnicodeObject *self,
6046 PyObject *list,
6047 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006048 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006049{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006050 register Py_ssize_t i;
6051 register Py_ssize_t j;
6052 Py_ssize_t len = self->length;
6053 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006054 PyObject *str;
6055
6056 for (i = len - sublen, j = len; i >= 0; ) {
6057 if (Py_UNICODE_MATCH(self, i, substring)) {
6058 if (maxcount-- <= 0)
6059 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006060 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006061 j = i;
6062 i -= sublen;
6063 } else
6064 i--;
6065 }
6066 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006067 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006068 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006069 if (PyList_Reverse(list) < 0)
6070 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006071 return list;
6072
6073 onError:
6074 Py_DECREF(list);
6075 return NULL;
6076}
6077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078#undef SPLIT_APPEND
6079
6080static
6081PyObject *split(PyUnicodeObject *self,
6082 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006083 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084{
6085 PyObject *list;
6086
6087 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006088 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
6090 list = PyList_New(0);
6091 if (!list)
6092 return NULL;
6093
6094 if (substring == NULL)
6095 return split_whitespace(self,list,maxcount);
6096
6097 else if (substring->length == 1)
6098 return split_char(self,list,substring->str[0],maxcount);
6099
6100 else if (substring->length == 0) {
6101 Py_DECREF(list);
6102 PyErr_SetString(PyExc_ValueError, "empty separator");
6103 return NULL;
6104 }
6105 else
6106 return split_substring(self,list,substring,maxcount);
6107}
6108
Tim Petersced69f82003-09-16 20:30:58 +00006109static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006110PyObject *rsplit(PyUnicodeObject *self,
6111 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006113{
6114 PyObject *list;
6115
6116 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006117 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006118
6119 list = PyList_New(0);
6120 if (!list)
6121 return NULL;
6122
6123 if (substring == NULL)
6124 return rsplit_whitespace(self,list,maxcount);
6125
6126 else if (substring->length == 1)
6127 return rsplit_char(self,list,substring->str[0],maxcount);
6128
6129 else if (substring->length == 0) {
6130 Py_DECREF(list);
6131 PyErr_SetString(PyExc_ValueError, "empty separator");
6132 return NULL;
6133 }
6134 else
6135 return rsplit_substring(self,list,substring,maxcount);
6136}
6137
6138static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139PyObject *replace(PyUnicodeObject *self,
6140 PyUnicodeObject *str1,
6141 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006142 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
6144 PyUnicodeObject *u;
6145
6146 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006147 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Thomas Wouters477c8d52006-05-27 19:21:47 +00006149 if (str1->length == str2->length) {
6150 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006151 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006152 if (str1->length == 1) {
6153 /* replace characters */
6154 Py_UNICODE u1, u2;
6155 if (!findchar(self->str, self->length, str1->str[0]))
6156 goto nothing;
6157 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6158 if (!u)
6159 return NULL;
6160 Py_UNICODE_COPY(u->str, self->str, self->length);
6161 u1 = str1->str[0];
6162 u2 = str2->str[0];
6163 for (i = 0; i < u->length; i++)
6164 if (u->str[i] == u1) {
6165 if (--maxcount < 0)
6166 break;
6167 u->str[i] = u2;
6168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006170 i = fastsearch(
6171 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006173 if (i < 0)
6174 goto nothing;
6175 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6176 if (!u)
6177 return NULL;
6178 Py_UNICODE_COPY(u->str, self->str, self->length);
6179 while (i <= self->length - str1->length)
6180 if (Py_UNICODE_MATCH(self, i, str1)) {
6181 if (--maxcount < 0)
6182 break;
6183 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6184 i += str1->length;
6185 } else
6186 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006189
6190 Py_ssize_t n, i, j, e;
6191 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 Py_UNICODE *p;
6193
6194 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006195 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 if (n > maxcount)
6197 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006198 if (n == 0)
6199 goto nothing;
6200 /* new_size = self->length + n * (str2->length - str1->length)); */
6201 delta = (str2->length - str1->length);
6202 if (delta == 0) {
6203 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006205 product = n * (str2->length - str1->length);
6206 if ((product / (str2->length - str1->length)) != n) {
6207 PyErr_SetString(PyExc_OverflowError,
6208 "replace string is too long");
6209 return NULL;
6210 }
6211 new_size = self->length + product;
6212 if (new_size < 0) {
6213 PyErr_SetString(PyExc_OverflowError,
6214 "replace string is too long");
6215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 }
6217 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006218 u = _PyUnicode_New(new_size);
6219 if (!u)
6220 return NULL;
6221 i = 0;
6222 p = u->str;
6223 e = self->length - str1->length;
6224 if (str1->length > 0) {
6225 while (n-- > 0) {
6226 /* look for next match */
6227 j = i;
6228 while (j <= e) {
6229 if (Py_UNICODE_MATCH(self, j, str1))
6230 break;
6231 j++;
6232 }
6233 if (j > i) {
6234 if (j > e)
6235 break;
6236 /* copy unchanged part [i:j] */
6237 Py_UNICODE_COPY(p, self->str+i, j-i);
6238 p += j - i;
6239 }
6240 /* copy substitution string */
6241 if (str2->length > 0) {
6242 Py_UNICODE_COPY(p, str2->str, str2->length);
6243 p += str2->length;
6244 }
6245 i = j + str1->length;
6246 }
6247 if (i < self->length)
6248 /* copy tail [i:] */
6249 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6250 } else {
6251 /* interleave */
6252 while (n > 0) {
6253 Py_UNICODE_COPY(p, str2->str, str2->length);
6254 p += str2->length;
6255 if (--n <= 0)
6256 break;
6257 *p++ = self->str[i++];
6258 }
6259 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006263
6264nothing:
6265 /* nothing to replace; return original string (when possible) */
6266 if (PyUnicode_CheckExact(self)) {
6267 Py_INCREF(self);
6268 return (PyObject *) self;
6269 }
6270 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
6273/* --- Unicode Object Methods --------------------------------------------- */
6274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006275PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006276"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277\n\
6278Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006279characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
6281static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006282unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 return fixup(self, fixtitle);
6285}
6286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006287PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006288"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289\n\
6290Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006291have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
6293static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006294unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 return fixup(self, fixcapitalize);
6297}
6298
6299#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006300PyDoc_STRVAR(capwords__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006301"S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302\n\
6303Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006304normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305
6306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006307unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
6309 PyObject *list;
6310 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006311 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 /* Split into words */
6314 list = split(self, NULL, -1);
6315 if (!list)
6316 return NULL;
6317
6318 /* Capitalize each word */
6319 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6320 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6321 fixcapitalize);
6322 if (item == NULL)
6323 goto onError;
6324 Py_DECREF(PyList_GET_ITEM(list, i));
6325 PyList_SET_ITEM(list, i, item);
6326 }
6327
6328 /* Join the words to form a new string */
6329 item = PyUnicode_Join(NULL, list);
6330
6331onError:
6332 Py_DECREF(list);
6333 return (PyObject *)item;
6334}
6335#endif
6336
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006337/* Argument converter. Coerces to a single unicode character */
6338
6339static int
6340convert_uc(PyObject *obj, void *addr)
6341{
6342 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6343 PyObject *uniobj;
6344 Py_UNICODE *unistr;
6345
6346 uniobj = PyUnicode_FromObject(obj);
6347 if (uniobj == NULL) {
6348 PyErr_SetString(PyExc_TypeError,
6349 "The fill character cannot be converted to Unicode");
6350 return 0;
6351 }
6352 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6353 PyErr_SetString(PyExc_TypeError,
6354 "The fill character must be exactly one character long");
6355 Py_DECREF(uniobj);
6356 return 0;
6357 }
6358 unistr = PyUnicode_AS_UNICODE(uniobj);
6359 *fillcharloc = unistr[0];
6360 Py_DECREF(uniobj);
6361 return 1;
6362}
6363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006364PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006365"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006367Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006368done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
6370static PyObject *
6371unicode_center(PyUnicodeObject *self, PyObject *args)
6372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006373 Py_ssize_t marg, left;
6374 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006375 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376
Thomas Woutersde017742006-02-16 19:34:37 +00006377 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 return NULL;
6379
Tim Peters7a29bd52001-09-12 03:03:31 +00006380 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 Py_INCREF(self);
6382 return (PyObject*) self;
6383 }
6384
6385 marg = width - self->length;
6386 left = marg / 2 + (marg & width & 1);
6387
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006388 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
Marc-André Lemburge5034372000-08-08 08:04:29 +00006391#if 0
6392
6393/* This code should go into some future Unicode collation support
6394 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006395 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006396
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006397/* speedy UTF-16 code point order comparison */
6398/* gleaned from: */
6399/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6400
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006401static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006402{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006403 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006404 0, 0, 0, 0, 0, 0, 0, 0,
6405 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006406 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006407};
6408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409static int
6410unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6411{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006412 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 Py_UNICODE *s1 = str1->str;
6415 Py_UNICODE *s2 = str2->str;
6416
6417 len1 = str1->length;
6418 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006421 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006422
6423 c1 = *s1++;
6424 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006425
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006426 if (c1 > (1<<11) * 26)
6427 c1 += utf16Fixup[c1>>11];
6428 if (c2 > (1<<11) * 26)
6429 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006430 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006431
6432 if (c1 != c2)
6433 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006434
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006435 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 }
6437
6438 return (len1 < len2) ? -1 : (len1 != len2);
6439}
6440
Marc-André Lemburge5034372000-08-08 08:04:29 +00006441#else
6442
6443static int
6444unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006446 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006447
6448 Py_UNICODE *s1 = str1->str;
6449 Py_UNICODE *s2 = str2->str;
6450
6451 len1 = str1->length;
6452 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006453
Marc-André Lemburge5034372000-08-08 08:04:29 +00006454 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006455 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006456
Fredrik Lundh45714e92001-06-26 16:39:36 +00006457 c1 = *s1++;
6458 c2 = *s2++;
6459
6460 if (c1 != c2)
6461 return (c1 < c2) ? -1 : 1;
6462
Marc-André Lemburge5034372000-08-08 08:04:29 +00006463 len1--; len2--;
6464 }
6465
6466 return (len1 < len2) ? -1 : (len1 != len2);
6467}
6468
6469#endif
6470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471int PyUnicode_Compare(PyObject *left,
6472 PyObject *right)
6473{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006474 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6475 return unicode_compare((PyUnicodeObject *)left,
6476 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006477 PyErr_Format(PyExc_TypeError,
6478 "Can't compare %.100s and %.100s",
6479 left->ob_type->tp_name,
6480 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 return -1;
6482}
6483
Martin v. Löwis5b222132007-06-10 09:51:05 +00006484int
6485PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6486{
6487 int i;
6488 Py_UNICODE *id;
6489 assert(PyUnicode_Check(uni));
6490 id = PyUnicode_AS_UNICODE(uni);
6491 /* Compare Unicode string and source character set string */
6492 for (i = 0; id[i] && str[i]; i++)
6493 if (id[i] != str[i])
6494 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6495 if (id[i])
6496 return 1; /* uni is longer */
6497 if (str[i])
6498 return -1; /* str is longer */
6499 return 0;
6500}
6501
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006502
6503#define TEST_COND(cond) \
6504 ((cond) ? Py_True : Py_False)
6505
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006506PyObject *PyUnicode_RichCompare(PyObject *left,
6507 PyObject *right,
6508 int op)
6509{
6510 int result;
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006511
6512 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6513 PyObject *v;
6514 if (((PyUnicodeObject *) left)->length !=
6515 ((PyUnicodeObject *) right)->length) {
6516 if (op == Py_EQ) {
6517 Py_INCREF(Py_False);
6518 return Py_False;
6519 }
6520 if (op == Py_NE) {
6521 Py_INCREF(Py_True);
6522 return Py_True;
6523 }
6524 }
6525 if (left == right)
6526 result = 0;
6527 else
6528 result = unicode_compare((PyUnicodeObject *)left,
6529 (PyUnicodeObject *)right);
6530
6531 /* Convert the return value to a Boolean */
6532 switch (op) {
6533 case Py_EQ:
6534 v = TEST_COND(result == 0);
6535 break;
6536 case Py_NE:
6537 v = TEST_COND(result != 0);
6538 break;
6539 case Py_LE:
6540 v = TEST_COND(result <= 0);
6541 break;
6542 case Py_GE:
6543 v = TEST_COND(result >= 0);
6544 break;
6545 case Py_LT:
6546 v = TEST_COND(result == -1);
6547 break;
6548 case Py_GT:
6549 v = TEST_COND(result == 1);
6550 break;
6551 default:
6552 PyErr_BadArgument();
6553 return NULL;
6554 }
6555 Py_INCREF(v);
6556 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006557 }
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006558
6559 Py_INCREF(Py_NotImplemented);
6560 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006561}
6562
Guido van Rossum403d68b2000-03-13 15:55:09 +00006563int PyUnicode_Contains(PyObject *container,
6564 PyObject *element)
6565{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006567 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006568
6569 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 sub = PyUnicode_FromObject(element);
6571 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006572 PyErr_Format(PyExc_TypeError,
6573 "'in <string>' requires string as left operand, not %s",
6574 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006576 }
6577
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 str = PyUnicode_FromObject(container);
6579 if (!str) {
6580 Py_DECREF(sub);
6581 return -1;
6582 }
6583
6584 result = stringlib_contains_obj(str, sub);
6585
6586 Py_DECREF(str);
6587 Py_DECREF(sub);
6588
Guido van Rossum403d68b2000-03-13 15:55:09 +00006589 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006590}
6591
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592/* Concat to string or Unicode object giving a new Unicode object. */
6593
6594PyObject *PyUnicode_Concat(PyObject *left,
6595 PyObject *right)
6596{
6597 PyUnicodeObject *u = NULL, *v = NULL, *w;
6598
6599 /* Coerce the two arguments */
6600 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6601 if (u == NULL)
6602 goto onError;
6603 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6604 if (v == NULL)
6605 goto onError;
6606
6607 /* Shortcuts */
6608 if (v == unicode_empty) {
6609 Py_DECREF(v);
6610 return (PyObject *)u;
6611 }
6612 if (u == unicode_empty) {
6613 Py_DECREF(u);
6614 return (PyObject *)v;
6615 }
6616
6617 /* Concat the two Unicode strings */
6618 w = _PyUnicode_New(u->length + v->length);
6619 if (w == NULL)
6620 goto onError;
6621 Py_UNICODE_COPY(w->str, u->str, u->length);
6622 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6623
6624 Py_DECREF(u);
6625 Py_DECREF(v);
6626 return (PyObject *)w;
6627
6628onError:
6629 Py_XDECREF(u);
6630 Py_XDECREF(v);
6631 return NULL;
6632}
6633
Walter Dörwald1ab83302007-05-18 17:15:44 +00006634void
6635PyUnicode_Append(PyObject **pleft, PyObject *right)
6636{
6637 PyObject *new;
6638 if (*pleft == NULL)
6639 return;
6640 if (right == NULL || !PyUnicode_Check(*pleft)) {
6641 Py_DECREF(*pleft);
6642 *pleft = NULL;
6643 return;
6644 }
6645 new = PyUnicode_Concat(*pleft, right);
6646 Py_DECREF(*pleft);
6647 *pleft = new;
6648}
6649
6650void
6651PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6652{
6653 PyUnicode_Append(pleft, right);
6654 Py_XDECREF(right);
6655}
6656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006657PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658"S.count(sub[, start[, end]]) -> int\n\
6659\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006660Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006661string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006662interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
6664static PyObject *
6665unicode_count(PyUnicodeObject *self, PyObject *args)
6666{
6667 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006669 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 PyObject *result;
6671
Guido van Rossumb8872e62000-05-09 14:14:27 +00006672 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6673 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 return NULL;
6675
6676 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006677 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 if (substring == NULL)
6679 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006680
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Christian Heimes217cfd12007-12-02 14:31:20 +00006683 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006684 stringlib_count(self->str + start, end - start,
6685 substring->str, substring->length)
6686 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 return result;
6691}
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006694"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006696Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006697to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006698handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6700'xmlcharrefreplace' as well as any other name registered with\n\
6701codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703static PyObject *
6704unicode_encode(PyUnicodeObject *self, PyObject *args)
6705{
6706 char *encoding = NULL;
6707 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006708 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006709
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6711 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006712 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006713 if (v == NULL)
6714 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006715 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006716 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006717 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006718 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006719 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006720 Py_DECREF(v);
6721 return NULL;
6722 }
6723 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006724
6725 onError:
6726 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006727}
6728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006729PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006730"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731\n\
6732Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
6736unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6737{
6738 Py_UNICODE *e;
6739 Py_UNICODE *p;
6740 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006741 Py_UNICODE *qe;
6742 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 PyUnicodeObject *u;
6744 int tabsize = 8;
6745
6746 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6747 return NULL;
6748
Thomas Wouters7e474022000-07-16 12:04:32 +00006749 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006750 i = 0; /* chars up to and including most recent \n or \r */
6751 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6752 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 for (p = self->str; p < e; p++)
6754 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006755 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006756 incr = tabsize - (j % tabsize); /* cannot overflow */
6757 if (j > PY_SSIZE_T_MAX - incr)
6758 goto overflow1;
6759 j += incr;
6760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 }
6762 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006763 if (j > PY_SSIZE_T_MAX - 1)
6764 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 j++;
6766 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006767 if (i > PY_SSIZE_T_MAX - j)
6768 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006770 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
6772 }
6773
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006774 if (i > PY_SSIZE_T_MAX - j)
6775 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Second pass: create output string and fill it */
6778 u = _PyUnicode_New(i + j);
6779 if (!u)
6780 return NULL;
6781
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006782 j = 0; /* same as in first pass */
6783 q = u->str; /* next output char */
6784 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
6786 for (p = self->str; p < e; p++)
6787 if (*p == '\t') {
6788 if (tabsize > 0) {
6789 i = tabsize - (j % tabsize);
6790 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006791 while (i--) {
6792 if (q >= qe)
6793 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 }
6797 }
6798 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006799 if (q >= qe)
6800 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006802 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 if (*p == '\n' || *p == '\r')
6804 j = 0;
6805 }
6806
6807 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006808
6809 overflow2:
6810 Py_DECREF(u);
6811 overflow1:
6812 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006817"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818\n\
6819Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006820such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821arguments start and end are interpreted as in slice notation.\n\
6822\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
6825static PyObject *
6826unicode_find(PyUnicodeObject *self, PyObject *args)
6827{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006828 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006829 Py_ssize_t start;
6830 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006831 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
Christian Heimes9cd17752007-11-18 19:35:23 +00006833 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Thomas Wouters477c8d52006-05-27 19:21:47 +00006836 result = stringlib_find_slice(
6837 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6838 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6839 start, end
6840 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
6842 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006843
Christian Heimes217cfd12007-12-02 14:31:20 +00006844 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845}
6846
6847static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006848unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849{
6850 if (index < 0 || index >= self->length) {
6851 PyErr_SetString(PyExc_IndexError, "string index out of range");
6852 return NULL;
6853 }
6854
6855 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6856}
6857
Guido van Rossumc2504932007-09-18 19:42:40 +00006858/* Believe it or not, this produces the same value for ASCII strings
6859 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006861unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
Guido van Rossumc2504932007-09-18 19:42:40 +00006863 Py_ssize_t len;
6864 Py_UNICODE *p;
6865 long x;
6866
6867 if (self->hash != -1)
6868 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006869 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006870 p = self->str;
6871 x = *p << 7;
6872 while (--len >= 0)
6873 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006874 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006875 if (x == -1)
6876 x = -2;
6877 self->hash = x;
6878 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006882"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886static PyObject *
6887unicode_index(PyUnicodeObject *self, PyObject *args)
6888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006889 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006890 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006891 Py_ssize_t start;
6892 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
Christian Heimes9cd17752007-11-18 19:35:23 +00006894 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Thomas Wouters477c8d52006-05-27 19:21:47 +00006897 result = stringlib_find_slice(
6898 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6899 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6900 start, end
6901 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
6903 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 if (result < 0) {
6906 PyErr_SetString(PyExc_ValueError, "substring not found");
6907 return NULL;
6908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006909
Christian Heimes217cfd12007-12-02 14:31:20 +00006910 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911}
6912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006913PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
6919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006920unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921{
6922 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6923 register const Py_UNICODE *e;
6924 int cased;
6925
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 /* Shortcut for single character strings */
6927 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006930 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006931 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006933
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 e = p + PyUnicode_GET_SIZE(self);
6935 cased = 0;
6936 for (; p < e; p++) {
6937 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006938
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 else if (!cased && Py_UNICODE_ISLOWER(ch))
6942 cased = 1;
6943 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945}
6946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006950Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
6953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006954unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955{
6956 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6957 register const Py_UNICODE *e;
6958 int cased;
6959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 /* Shortcut for single character strings */
6961 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006964 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006965 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006966 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006967
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 e = p + PyUnicode_GET_SIZE(self);
6969 cased = 0;
6970 for (; p < e; p++) {
6971 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006972
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 else if (!cased && Py_UNICODE_ISUPPER(ch))
6976 cased = 1;
6977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979}
6980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006984Return True if S is a titlecased string and there is at least one\n\
6985character in S, i.e. upper- and titlecase characters may only\n\
6986follow uncased characters and lowercase characters only cased ones.\n\
6987Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006990unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
6992 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6993 register const Py_UNICODE *e;
6994 int cased, previous_is_cased;
6995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 /* Shortcut for single character strings */
6997 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006998 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6999 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007001 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007002 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007004
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 e = p + PyUnicode_GET_SIZE(self);
7006 cased = 0;
7007 previous_is_cased = 0;
7008 for (; p < e; p++) {
7009 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7012 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 previous_is_cased = 1;
7015 cased = 1;
7016 }
7017 else if (Py_UNICODE_ISLOWER(ch)) {
7018 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 previous_is_cased = 1;
7021 cased = 1;
7022 }
7023 else
7024 previous_is_cased = 0;
7025 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007026 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007030"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007032Return True if all characters in S are whitespace\n\
7033and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
7035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007036unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
7038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7039 register const Py_UNICODE *e;
7040
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 /* Shortcut for single character strings */
7042 if (PyUnicode_GET_SIZE(self) == 1 &&
7043 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007046 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007047 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007049
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 e = p + PyUnicode_GET_SIZE(self);
7051 for (; p < e; p++) {
7052 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007053 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007055 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007059"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007060\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007061Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007063
7064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007065unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007066{
7067 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7068 register const Py_UNICODE *e;
7069
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007070 /* Shortcut for single character strings */
7071 if (PyUnicode_GET_SIZE(self) == 1 &&
7072 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007073 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007074
7075 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007076 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007077 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007078
7079 e = p + PyUnicode_GET_SIZE(self);
7080 for (; p < e; p++) {
7081 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007082 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007083 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007084 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007085}
7086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007088"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007090Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007092
7093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007094unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007095{
7096 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7097 register const Py_UNICODE *e;
7098
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007099 /* Shortcut for single character strings */
7100 if (PyUnicode_GET_SIZE(self) == 1 &&
7101 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007102 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007103
7104 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007105 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007106 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007107
7108 e = p + PyUnicode_GET_SIZE(self);
7109 for (; p < e; p++) {
7110 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007111 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007113 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007114}
7115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007117"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007119Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007123unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
7125 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7126 register const Py_UNICODE *e;
7127
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 /* Shortcut for single character strings */
7129 if (PyUnicode_GET_SIZE(self) == 1 &&
7130 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007131 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007133 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007134 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007135 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 e = p + PyUnicode_GET_SIZE(self);
7138 for (; p < e; p++) {
7139 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007140 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007142 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143}
7144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007146"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007148Return True if all characters in S are digits\n\
7149and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007152unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153{
7154 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7155 register const Py_UNICODE *e;
7156
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 /* Shortcut for single character strings */
7158 if (PyUnicode_GET_SIZE(self) == 1 &&
7159 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007160 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007162 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007163 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007164 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 e = p + PyUnicode_GET_SIZE(self);
7167 for (; p < e; p++) {
7168 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007169 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007171 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172}
7173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007174PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007175"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
7180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007181unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
7183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7184 register const Py_UNICODE *e;
7185
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 /* Shortcut for single character strings */
7187 if (PyUnicode_GET_SIZE(self) == 1 &&
7188 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007189 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007191 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007192 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007193 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007194
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 e = p + PyUnicode_GET_SIZE(self);
7196 for (; p < e; p++) {
7197 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007198 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007200 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201}
7202
Martin v. Löwis47383402007-08-15 07:32:56 +00007203int
7204PyUnicode_IsIdentifier(PyObject *self)
7205{
7206 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7207 register const Py_UNICODE *e;
7208
7209 /* Special case for empty strings */
7210 if (PyUnicode_GET_SIZE(self) == 0)
7211 return 0;
7212
7213 /* PEP 3131 says that the first character must be in
7214 XID_Start and subsequent characters in XID_Continue,
7215 and for the ASCII range, the 2.x rules apply (i.e
7216 start with letters and underscore, continue with
7217 letters, digits, underscore). However, given the current
7218 definition of XID_Start and XID_Continue, it is sufficient
7219 to check just for these, except that _ must be allowed
7220 as starting an identifier. */
7221 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7222 return 0;
7223
7224 e = p + PyUnicode_GET_SIZE(self);
7225 for (p++; p < e; p++) {
7226 if (!_PyUnicode_IsXidContinue(*p))
7227 return 0;
7228 }
7229 return 1;
7230}
7231
7232PyDoc_STRVAR(isidentifier__doc__,
7233"S.isidentifier() -> bool\n\
7234\n\
7235Return True if S is a valid identifier according\n\
7236to the language definition.");
7237
7238static PyObject*
7239unicode_isidentifier(PyObject *self)
7240{
7241 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7242}
7243
Georg Brandl559e5d72008-06-11 18:37:52 +00007244PyDoc_STRVAR(isprintable__doc__,
7245"S.isprintable() -> bool\n\
7246\n\
7247Return True if all characters in S are considered\n\
7248printable in repr() or S is empty, False otherwise.");
7249
7250static PyObject*
7251unicode_isprintable(PyObject *self)
7252{
7253 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7254 register const Py_UNICODE *e;
7255
7256 /* Shortcut for single character strings */
7257 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7258 Py_RETURN_TRUE;
7259 }
7260
7261 e = p + PyUnicode_GET_SIZE(self);
7262 for (; p < e; p++) {
7263 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7264 Py_RETURN_FALSE;
7265 }
7266 }
7267 Py_RETURN_TRUE;
7268}
7269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007270PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007271"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272\n\
7273Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007274sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
7276static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007277unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007279 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280}
7281
Martin v. Löwis18e16552006-02-15 17:27:45 +00007282static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283unicode_length(PyUnicodeObject *self)
7284{
7285 return self->length;
7286}
7287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007288PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007289"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007291Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007292done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
7294static PyObject *
7295unicode_ljust(PyUnicodeObject *self, PyObject *args)
7296{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007297 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007298 Py_UNICODE fillchar = ' ';
7299
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007300 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 return NULL;
7302
Tim Peters7a29bd52001-09-12 03:03:31 +00007303 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 Py_INCREF(self);
7305 return (PyObject*) self;
7306 }
7307
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007308 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309}
7310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007311PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007312"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007314Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007317unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 return fixup(self, fixlower);
7320}
7321
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7330
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007331/* externally visible for str.strip(unicode) */
7332PyObject *
7333_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7334{
7335 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007336 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007337 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007338 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7339 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007340
Thomas Wouters477c8d52006-05-27 19:21:47 +00007341 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7342
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343 i = 0;
7344 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007345 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7346 i++;
7347 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007348 }
7349
7350 j = len;
7351 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007352 do {
7353 j--;
7354 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7355 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007356 }
7357
7358 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007359 Py_INCREF(self);
7360 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007361 }
7362 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007363 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007364}
7365
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
7367static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007368do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007370 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007371 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007372
7373 i = 0;
7374 if (striptype != RIGHTSTRIP) {
7375 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7376 i++;
7377 }
7378 }
7379
7380 j = len;
7381 if (striptype != LEFTSTRIP) {
7382 do {
7383 j--;
7384 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7385 j++;
7386 }
7387
7388 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7389 Py_INCREF(self);
7390 return (PyObject*)self;
7391 }
7392 else
7393 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007396
7397static PyObject *
7398do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7399{
7400 PyObject *sep = NULL;
7401
7402 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7403 return NULL;
7404
7405 if (sep != NULL && sep != Py_None) {
7406 if (PyUnicode_Check(sep))
7407 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007408 else {
7409 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007410 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007411 STRIPNAME(striptype));
7412 return NULL;
7413 }
7414 }
7415
7416 return do_strip(self, striptype);
7417}
7418
7419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007420PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007421"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007422\n\
7423Return a copy of the string S with leading and trailing\n\
7424whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007425If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007426
7427static PyObject *
7428unicode_strip(PyUnicodeObject *self, PyObject *args)
7429{
7430 if (PyTuple_GET_SIZE(args) == 0)
7431 return do_strip(self, BOTHSTRIP); /* Common case */
7432 else
7433 return do_argstrip(self, BOTHSTRIP, args);
7434}
7435
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007438"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007439\n\
7440Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007441If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007442
7443static PyObject *
7444unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7445{
7446 if (PyTuple_GET_SIZE(args) == 0)
7447 return do_strip(self, LEFTSTRIP); /* Common case */
7448 else
7449 return do_argstrip(self, LEFTSTRIP, args);
7450}
7451
7452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007453PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007454"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007455\n\
7456Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007457If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007458
7459static PyObject *
7460unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7461{
7462 if (PyTuple_GET_SIZE(args) == 0)
7463 return do_strip(self, RIGHTSTRIP); /* Common case */
7464 else
7465 return do_argstrip(self, RIGHTSTRIP, args);
7466}
7467
7468
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007470unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
7472 PyUnicodeObject *u;
7473 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007474 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007475 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477 if (len < 0)
7478 len = 0;
7479
Tim Peters7a29bd52001-09-12 03:03:31 +00007480 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 /* no repeat, return original string */
7482 Py_INCREF(str);
7483 return (PyObject*) str;
7484 }
Tim Peters8f422462000-09-09 06:13:41 +00007485
7486 /* ensure # of chars needed doesn't overflow int and # of bytes
7487 * needed doesn't overflow size_t
7488 */
7489 nchars = len * str->length;
7490 if (len && nchars / len != str->length) {
7491 PyErr_SetString(PyExc_OverflowError,
7492 "repeated string is too long");
7493 return NULL;
7494 }
7495 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7496 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7497 PyErr_SetString(PyExc_OverflowError,
7498 "repeated string is too long");
7499 return NULL;
7500 }
7501 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 if (!u)
7503 return NULL;
7504
7505 p = u->str;
7506
Thomas Wouters477c8d52006-05-27 19:21:47 +00007507 if (str->length == 1 && len > 0) {
7508 Py_UNICODE_FILL(p, str->str[0], len);
7509 } else {
7510 Py_ssize_t done = 0; /* number of characters copied this far */
7511 if (done < nchars) {
7512 Py_UNICODE_COPY(p, str->str, str->length);
7513 done = str->length;
7514 }
7515 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007516 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007517 Py_UNICODE_COPY(p+done, p, n);
7518 done += n;
7519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 }
7521
7522 return (PyObject*) u;
7523}
7524
7525PyObject *PyUnicode_Replace(PyObject *obj,
7526 PyObject *subobj,
7527 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529{
7530 PyObject *self;
7531 PyObject *str1;
7532 PyObject *str2;
7533 PyObject *result;
7534
7535 self = PyUnicode_FromObject(obj);
7536 if (self == NULL)
7537 return NULL;
7538 str1 = PyUnicode_FromObject(subobj);
7539 if (str1 == NULL) {
7540 Py_DECREF(self);
7541 return NULL;
7542 }
7543 str2 = PyUnicode_FromObject(replobj);
7544 if (str2 == NULL) {
7545 Py_DECREF(self);
7546 Py_DECREF(str1);
7547 return NULL;
7548 }
Tim Petersced69f82003-09-16 20:30:58 +00007549 result = replace((PyUnicodeObject *)self,
7550 (PyUnicodeObject *)str1,
7551 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 maxcount);
7553 Py_DECREF(self);
7554 Py_DECREF(str1);
7555 Py_DECREF(str2);
7556 return result;
7557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007560"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
7562Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007563old replaced by new. If the optional argument count is\n\
7564given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
7566static PyObject*
7567unicode_replace(PyUnicodeObject *self, PyObject *args)
7568{
7569 PyUnicodeObject *str1;
7570 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007571 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 PyObject *result;
7573
Martin v. Löwis18e16552006-02-15 17:27:45 +00007574 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 return NULL;
7576 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7577 if (str1 == NULL)
7578 return NULL;
7579 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007580 if (str2 == NULL) {
7581 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585 result = replace(self, str1, str2, maxcount);
7586
7587 Py_DECREF(str1);
7588 Py_DECREF(str2);
7589 return result;
7590}
7591
7592static
7593PyObject *unicode_repr(PyObject *unicode)
7594{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007595 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007596 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007597 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7598 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7599
7600 /* XXX(nnorwitz): rather than over-allocating, it would be
7601 better to choose a different scheme. Perhaps scan the
7602 first N-chars of the string and allocate based on that size.
7603 */
7604 /* Initial allocation is based on the longest-possible unichr
7605 escape.
7606
7607 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7608 unichr, so in this case it's the longest unichr escape. In
7609 narrow (UTF-16) builds this is five chars per source unichr
7610 since there are two unichrs in the surrogate pair, so in narrow
7611 (UTF-16) builds it's not the longest unichr escape.
7612
7613 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7614 so in the narrow (UTF-16) build case it's the longest unichr
7615 escape.
7616 */
7617
Walter Dörwald1ab83302007-05-18 17:15:44 +00007618 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007619 2 /* quotes */
7620#ifdef Py_UNICODE_WIDE
7621 + 10*size
7622#else
7623 + 6*size
7624#endif
7625 + 1);
7626 if (repr == NULL)
7627 return NULL;
7628
Walter Dörwald1ab83302007-05-18 17:15:44 +00007629 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007630
7631 /* Add quote */
7632 *p++ = (findchar(s, size, '\'') &&
7633 !findchar(s, size, '"')) ? '"' : '\'';
7634 while (size-- > 0) {
7635 Py_UNICODE ch = *s++;
7636
7637 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007638 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007639 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007640 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007641 continue;
7642 }
7643
Georg Brandl559e5d72008-06-11 18:37:52 +00007644 /* Map special whitespace to '\t', \n', '\r' */
7645 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007646 *p++ = '\\';
7647 *p++ = 't';
7648 }
7649 else if (ch == '\n') {
7650 *p++ = '\\';
7651 *p++ = 'n';
7652 }
7653 else if (ch == '\r') {
7654 *p++ = '\\';
7655 *p++ = 'r';
7656 }
7657
7658 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007659 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007660 *p++ = '\\';
7661 *p++ = 'x';
7662 *p++ = hexdigits[(ch >> 4) & 0x000F];
7663 *p++ = hexdigits[ch & 0x000F];
7664 }
7665
Georg Brandl559e5d72008-06-11 18:37:52 +00007666 /* Copy ASCII characters as-is */
7667 else if (ch < 0x7F) {
7668 *p++ = ch;
7669 }
7670
7671 /* Non-ASCII characters */
7672 else {
7673 Py_UCS4 ucs = ch;
7674
7675#ifndef Py_UNICODE_WIDE
7676 Py_UNICODE ch2 = 0;
7677 /* Get code point from surrogate pair */
7678 if (size > 0) {
7679 ch2 = *s;
7680 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7681 && ch2 <= 0xDFFF) {
7682 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7683 + 0x00010000;
7684 s++;
7685 size--;
7686 }
7687 }
7688#endif
7689 /* Map Unicode whitespace and control characters
7690 (categories Z* and C* except ASCII space)
7691 */
7692 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7693 /* Map 8-bit characters to '\xhh' */
7694 if (ucs <= 0xff) {
7695 *p++ = '\\';
7696 *p++ = 'x';
7697 *p++ = hexdigits[(ch >> 4) & 0x000F];
7698 *p++ = hexdigits[ch & 0x000F];
7699 }
7700 /* Map 21-bit characters to '\U00xxxxxx' */
7701 else if (ucs >= 0x10000) {
7702 *p++ = '\\';
7703 *p++ = 'U';
7704 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7705 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7706 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7707 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7708 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7709 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7710 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7711 *p++ = hexdigits[ucs & 0x0000000F];
7712 }
7713 /* Map 16-bit characters to '\uxxxx' */
7714 else {
7715 *p++ = '\\';
7716 *p++ = 'u';
7717 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7718 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7719 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7720 *p++ = hexdigits[ucs & 0x000F];
7721 }
7722 }
7723 /* Copy characters as-is */
7724 else {
7725 *p++ = ch;
7726#ifndef Py_UNICODE_WIDE
7727 if (ucs >= 0x10000)
7728 *p++ = ch2;
7729#endif
7730 }
7731 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007732 }
7733 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007734 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007735
7736 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007737 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007738 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007742"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
7744Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007745such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746arguments start and end are interpreted as in slice notation.\n\
7747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
7750static PyObject *
7751unicode_rfind(PyUnicodeObject *self, PyObject *args)
7752{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007753 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007754 Py_ssize_t start;
7755 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007756 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Christian Heimes9cd17752007-11-18 19:35:23 +00007758 if (!_ParseTupleFinds(args, &substring, &start, &end))
7759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
Thomas Wouters477c8d52006-05-27 19:21:47 +00007761 result = stringlib_rfind_slice(
7762 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7763 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7764 start, end
7765 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
7767 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007768
Christian Heimes217cfd12007-12-02 14:31:20 +00007769 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770}
7771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007772PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007773"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007775Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
7777static PyObject *
7778unicode_rindex(PyUnicodeObject *self, PyObject *args)
7779{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007780 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007781 Py_ssize_t start;
7782 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007783 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
Christian Heimes9cd17752007-11-18 19:35:23 +00007785 if (!_ParseTupleFinds(args, &substring, &start, &end))
7786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
Thomas Wouters477c8d52006-05-27 19:21:47 +00007788 result = stringlib_rfind_slice(
7789 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7790 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7791 start, end
7792 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
7794 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 if (result < 0) {
7797 PyErr_SetString(PyExc_ValueError, "substring not found");
7798 return NULL;
7799 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007800 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801}
7802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007803PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007804"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007806Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007807done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809static PyObject *
7810unicode_rjust(PyUnicodeObject *self, PyObject *args)
7811{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007812 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007813 Py_UNICODE fillchar = ' ';
7814
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007815 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 return NULL;
7817
Tim Peters7a29bd52001-09-12 03:03:31 +00007818 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 Py_INCREF(self);
7820 return (PyObject*) self;
7821 }
7822
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007823 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824}
7825
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826PyObject *PyUnicode_Split(PyObject *s,
7827 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007828 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829{
7830 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 s = PyUnicode_FromObject(s);
7833 if (s == NULL)
7834 return NULL;
7835 if (sep != NULL) {
7836 sep = PyUnicode_FromObject(sep);
7837 if (sep == NULL) {
7838 Py_DECREF(s);
7839 return NULL;
7840 }
7841 }
7842
7843 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7844
7845 Py_DECREF(s);
7846 Py_XDECREF(sep);
7847 return result;
7848}
7849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007850PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007851"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852\n\
7853Return a list of the words in S, using sep as the\n\
7854delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007855splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007856whitespace string is a separator and empty strings are\n\
7857removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858
7859static PyObject*
7860unicode_split(PyUnicodeObject *self, PyObject *args)
7861{
7862 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
Martin v. Löwis18e16552006-02-15 17:27:45 +00007865 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 return NULL;
7867
7868 if (substring == Py_None)
7869 return split(self, NULL, maxcount);
7870 else if (PyUnicode_Check(substring))
7871 return split(self, (PyUnicodeObject *)substring, maxcount);
7872 else
7873 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7874}
7875
Thomas Wouters477c8d52006-05-27 19:21:47 +00007876PyObject *
7877PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7878{
7879 PyObject* str_obj;
7880 PyObject* sep_obj;
7881 PyObject* out;
7882
7883 str_obj = PyUnicode_FromObject(str_in);
7884 if (!str_obj)
7885 return NULL;
7886 sep_obj = PyUnicode_FromObject(sep_in);
7887 if (!sep_obj) {
7888 Py_DECREF(str_obj);
7889 return NULL;
7890 }
7891
7892 out = stringlib_partition(
7893 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7894 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7895 );
7896
7897 Py_DECREF(sep_obj);
7898 Py_DECREF(str_obj);
7899
7900 return out;
7901}
7902
7903
7904PyObject *
7905PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7906{
7907 PyObject* str_obj;
7908 PyObject* sep_obj;
7909 PyObject* out;
7910
7911 str_obj = PyUnicode_FromObject(str_in);
7912 if (!str_obj)
7913 return NULL;
7914 sep_obj = PyUnicode_FromObject(sep_in);
7915 if (!sep_obj) {
7916 Py_DECREF(str_obj);
7917 return NULL;
7918 }
7919
7920 out = stringlib_rpartition(
7921 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7922 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7923 );
7924
7925 Py_DECREF(sep_obj);
7926 Py_DECREF(str_obj);
7927
7928 return out;
7929}
7930
7931PyDoc_STRVAR(partition__doc__,
7932"S.partition(sep) -> (head, sep, tail)\n\
7933\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007934Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007935the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007936found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007937
7938static PyObject*
7939unicode_partition(PyUnicodeObject *self, PyObject *separator)
7940{
7941 return PyUnicode_Partition((PyObject *)self, separator);
7942}
7943
7944PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007945"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007946\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007947Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007948the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007949separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007950
7951static PyObject*
7952unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7953{
7954 return PyUnicode_RPartition((PyObject *)self, separator);
7955}
7956
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007957PyObject *PyUnicode_RSplit(PyObject *s,
7958 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007960{
7961 PyObject *result;
7962
7963 s = PyUnicode_FromObject(s);
7964 if (s == NULL)
7965 return NULL;
7966 if (sep != NULL) {
7967 sep = PyUnicode_FromObject(sep);
7968 if (sep == NULL) {
7969 Py_DECREF(s);
7970 return NULL;
7971 }
7972 }
7973
7974 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7975
7976 Py_DECREF(s);
7977 Py_XDECREF(sep);
7978 return result;
7979}
7980
7981PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007982"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007983\n\
7984Return a list of the words in S, using sep as the\n\
7985delimiter string, starting at the end of the string and\n\
7986working to the front. If maxsplit is given, at most maxsplit\n\
7987splits are done. If sep is not specified, any whitespace string\n\
7988is a separator.");
7989
7990static PyObject*
7991unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7992{
7993 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007994 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007995
Martin v. Löwis18e16552006-02-15 17:27:45 +00007996 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007997 return NULL;
7998
7999 if (substring == Py_None)
8000 return rsplit(self, NULL, maxcount);
8001 else if (PyUnicode_Check(substring))
8002 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8003 else
8004 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson4469d0c2008-11-30 22:46:23 +00008008"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009\n\
8010Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008011Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008012is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013
8014static PyObject*
8015unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8016{
Guido van Rossum86662912000-04-11 15:38:46 +00008017 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Guido van Rossum86662912000-04-11 15:38:46 +00008019 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 return NULL;
8021
Guido van Rossum86662912000-04-11 15:38:46 +00008022 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023}
8024
8025static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008026PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027{
Walter Dörwald346737f2007-05-31 10:44:43 +00008028 if (PyUnicode_CheckExact(self)) {
8029 Py_INCREF(self);
8030 return self;
8031 } else
8032 /* Subtype -- return genuine unicode string with the same value. */
8033 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8034 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035}
8036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008038"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039\n\
8040Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008041and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042
8043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008044unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 return fixup(self, fixswapcase);
8047}
8048
Georg Brandlceee0772007-11-27 23:48:05 +00008049PyDoc_STRVAR(maketrans__doc__,
8050"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8051\n\
8052Return a translation table usable for str.translate().\n\
8053If there is only one argument, it must be a dictionary mapping Unicode\n\
8054ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008055Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008056If there are two arguments, they must be strings of equal length, and\n\
8057in the resulting dictionary, each character in x will be mapped to the\n\
8058character at the same position in y. If there is a third argument, it\n\
8059must be a string, whose characters will be mapped to None in the result.");
8060
8061static PyObject*
8062unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8063{
8064 PyObject *x, *y = NULL, *z = NULL;
8065 PyObject *new = NULL, *key, *value;
8066 Py_ssize_t i = 0;
8067 int res;
8068
8069 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8070 return NULL;
8071 new = PyDict_New();
8072 if (!new)
8073 return NULL;
8074 if (y != NULL) {
8075 /* x must be a string too, of equal length */
8076 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8077 if (!PyUnicode_Check(x)) {
8078 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8079 "be a string if there is a second argument");
8080 goto err;
8081 }
8082 if (PyUnicode_GET_SIZE(x) != ylen) {
8083 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8084 "arguments must have equal length");
8085 goto err;
8086 }
8087 /* create entries for translating chars in x to those in y */
8088 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008089 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8090 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008091 if (!key || !value)
8092 goto err;
8093 res = PyDict_SetItem(new, key, value);
8094 Py_DECREF(key);
8095 Py_DECREF(value);
8096 if (res < 0)
8097 goto err;
8098 }
8099 /* create entries for deleting chars in z */
8100 if (z != NULL) {
8101 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008102 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008103 if (!key)
8104 goto err;
8105 res = PyDict_SetItem(new, key, Py_None);
8106 Py_DECREF(key);
8107 if (res < 0)
8108 goto err;
8109 }
8110 }
8111 } else {
8112 /* x must be a dict */
8113 if (!PyDict_Check(x)) {
8114 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8115 "to maketrans it must be a dict");
8116 goto err;
8117 }
8118 /* copy entries into the new dict, converting string keys to int keys */
8119 while (PyDict_Next(x, &i, &key, &value)) {
8120 if (PyUnicode_Check(key)) {
8121 /* convert string keys to integer keys */
8122 PyObject *newkey;
8123 if (PyUnicode_GET_SIZE(key) != 1) {
8124 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8125 "table must be of length 1");
8126 goto err;
8127 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008128 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008129 if (!newkey)
8130 goto err;
8131 res = PyDict_SetItem(new, newkey, value);
8132 Py_DECREF(newkey);
8133 if (res < 0)
8134 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008135 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008136 /* just keep integer keys */
8137 if (PyDict_SetItem(new, key, value) < 0)
8138 goto err;
8139 } else {
8140 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8141 "be strings or integers");
8142 goto err;
8143 }
8144 }
8145 }
8146 return new;
8147 err:
8148 Py_DECREF(new);
8149 return NULL;
8150}
8151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008152PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008153"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154\n\
8155Return a copy of the string S, where all characters have been mapped\n\
8156through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008157Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008158Unmapped characters are left untouched. Characters mapped to None\n\
8159are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
8161static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008162unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163{
Georg Brandlceee0772007-11-27 23:48:05 +00008164 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165}
8166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008167PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008168"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008170Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
8172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008173unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 return fixup(self, fixupper);
8176}
8177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008178PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008179"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008181Pad a numeric string S with zeros on the left, to fill a field\n\
8182of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183
8184static PyObject *
8185unicode_zfill(PyUnicodeObject *self, PyObject *args)
8186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008187 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 PyUnicodeObject *u;
8189
Martin v. Löwis18e16552006-02-15 17:27:45 +00008190 Py_ssize_t width;
8191 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 return NULL;
8193
8194 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008195 if (PyUnicode_CheckExact(self)) {
8196 Py_INCREF(self);
8197 return (PyObject*) self;
8198 }
8199 else
8200 return PyUnicode_FromUnicode(
8201 PyUnicode_AS_UNICODE(self),
8202 PyUnicode_GET_SIZE(self)
8203 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 }
8205
8206 fill = width - self->length;
8207
8208 u = pad(self, fill, 0, '0');
8209
Walter Dörwald068325e2002-04-15 13:36:47 +00008210 if (u == NULL)
8211 return NULL;
8212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 if (u->str[fill] == '+' || u->str[fill] == '-') {
8214 /* move sign to beginning of string */
8215 u->str[0] = u->str[fill];
8216 u->str[fill] = '0';
8217 }
8218
8219 return (PyObject*) u;
8220}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221
#if 0
/* Compiled out: returns the current value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008230PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008231"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008233Return True if S starts with the specified prefix, False otherwise.\n\
8234With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008235With optional end, stop comparing S at that position.\n\
8236prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237
8238static PyObject *
8239unicode_startswith(PyUnicodeObject *self,
8240 PyObject *args)
8241{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008242 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008244 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008245 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008246 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008248 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008249 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008251 if (PyTuple_Check(subobj)) {
8252 Py_ssize_t i;
8253 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8254 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8255 PyTuple_GET_ITEM(subobj, i));
8256 if (substring == NULL)
8257 return NULL;
8258 result = tailmatch(self, substring, start, end, -1);
8259 Py_DECREF(substring);
8260 if (result) {
8261 Py_RETURN_TRUE;
8262 }
8263 }
8264 /* nothing matched */
8265 Py_RETURN_FALSE;
8266 }
8267 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008269 return NULL;
8270 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008272 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273}
8274
8275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008276PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008277"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008279Return True if S ends with the specified suffix, False otherwise.\n\
8280With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008281With optional end, stop comparing S at that position.\n\
8282suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283
8284static PyObject *
8285unicode_endswith(PyUnicodeObject *self,
8286 PyObject *args)
8287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008288 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008290 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008291 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008292 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008294 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8295 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008297 if (PyTuple_Check(subobj)) {
8298 Py_ssize_t i;
8299 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8300 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8301 PyTuple_GET_ITEM(subobj, i));
8302 if (substring == NULL)
8303 return NULL;
8304 result = tailmatch(self, substring, start, end, +1);
8305 Py_DECREF(substring);
8306 if (result) {
8307 Py_RETURN_TRUE;
8308 }
8309 }
8310 Py_RETURN_FALSE;
8311 }
8312 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008316 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008318 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
Eric Smith8c663262007-08-25 02:26:07 +00008321#include "stringlib/string_format.h"
8322
8323PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008324"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008325\n\
8326");
8327
Eric Smith4a7d76d2008-05-30 18:10:19 +00008328static PyObject *
8329unicode__format__(PyObject* self, PyObject* args)
8330{
8331 PyObject *format_spec;
8332
8333 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8334 return NULL;
8335
8336 return _PyUnicode_FormatAdvanced(self,
8337 PyUnicode_AS_UNICODE(format_spec),
8338 PyUnicode_GET_SIZE(format_spec));
8339}
8340
Eric Smith8c663262007-08-25 02:26:07 +00008341PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008342"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008343\n\
8344");
8345
8346static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008347unicode__sizeof__(PyUnicodeObject *v)
8348{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008349 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8350 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008351}
8352
8353PyDoc_STRVAR(sizeof__doc__,
8354"S.__sizeof__() -> size of S in memory, in bytes");
8355
8356static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008357unicode_getnewargs(PyUnicodeObject *v)
8358{
8359 return Py_BuildValue("(u#)", v->str, v->length);
8360}
8361
8362
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363static PyMethodDef unicode_methods[] = {
8364
8365 /* Order is according to common usage: often used methods should
8366 appear first, since lookup is done sequentially. */
8367
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008368 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8369 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8370 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008371 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008372 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8373 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8374 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8375 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8376 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8377 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8378 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008380 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8381 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8382 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008383 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008384 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8385 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8386 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008387 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008388 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008389 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008390 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008391 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8392 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8393 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8394 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8395 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8396 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8397 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8398 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8399 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8400 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8401 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8402 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8403 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8404 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008405 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008406 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008407 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008408 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008409 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008410 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8411 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008412 {"maketrans", (PyCFunction) unicode_maketrans,
8413 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008414 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008415#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008416 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417#endif
8418
8419#if 0
8420 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008421 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422#endif
8423
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008424 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 {NULL, NULL}
8426};
8427
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008428static PyObject *
8429unicode_mod(PyObject *v, PyObject *w)
8430{
8431 if (!PyUnicode_Check(v)) {
8432 Py_INCREF(Py_NotImplemented);
8433 return Py_NotImplemented;
8434 }
8435 return PyUnicode_Format(v, w);
8436}
8437
8438static PyNumberMethods unicode_as_number = {
8439 0, /*nb_add*/
8440 0, /*nb_subtract*/
8441 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008442 unicode_mod, /*nb_remainder*/
8443};
8444
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008446 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008447 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008448 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8449 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008450 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 0, /* sq_ass_item */
8452 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008453 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454};
8455
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008456static PyObject*
8457unicode_subscript(PyUnicodeObject* self, PyObject* item)
8458{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008459 if (PyIndex_Check(item)) {
8460 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008461 if (i == -1 && PyErr_Occurred())
8462 return NULL;
8463 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008464 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008465 return unicode_getitem(self, i);
8466 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008467 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008468 Py_UNICODE* source_buf;
8469 Py_UNICODE* result_buf;
8470 PyObject* result;
8471
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008472 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008473 &start, &stop, &step, &slicelength) < 0) {
8474 return NULL;
8475 }
8476
8477 if (slicelength <= 0) {
8478 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008479 } else if (start == 0 && step == 1 && slicelength == self->length &&
8480 PyUnicode_CheckExact(self)) {
8481 Py_INCREF(self);
8482 return (PyObject *)self;
8483 } else if (step == 1) {
8484 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008485 } else {
8486 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008487 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8488 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008489
8490 if (result_buf == NULL)
8491 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008492
8493 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8494 result_buf[i] = source_buf[cur];
8495 }
Tim Petersced69f82003-09-16 20:30:58 +00008496
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008497 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008498 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008499 return result;
8500 }
8501 } else {
8502 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8503 return NULL;
8504 }
8505}
8506
8507static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008508 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008509 (binaryfunc)unicode_subscript, /* mp_subscript */
8510 (objobjargproc)0, /* mp_ass_subscript */
8511};
8512
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514/* Helpers for PyUnicode_Format() */
8515
8516static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008517getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008519 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 if (argidx < arglen) {
8521 (*p_argidx)++;
8522 if (arglen < 0)
8523 return args;
8524 else
8525 return PyTuple_GetItem(args, argidx);
8526 }
8527 PyErr_SetString(PyExc_TypeError,
8528 "not enough arguments for format string");
8529 return NULL;
8530}
8531
Martin v. Löwis18e16552006-02-15 17:27:45 +00008532static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008533strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 register Py_ssize_t i;
8536 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 for (i = len - 1; i >= 0; i--)
8538 buffer[i] = (Py_UNICODE) charbuffer[i];
8539
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 return len;
8541}
8542
Neal Norwitzfc76d632006-01-10 06:03:13 +00008543static int
8544doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8545{
Tim Peters15231542006-02-16 01:08:01 +00008546 Py_ssize_t result;
8547
Neal Norwitzfc76d632006-01-10 06:03:13 +00008548 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008549 result = strtounicode(buffer, (char *)buffer);
8550 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008551}
8552
#if 0
/* Disabled.  Same scheme as doubletounicode(), for a long: format x as
   a char string in the start of buffer with PyOS_snprintf(), widen it
   in place with strtounicode(), and return the character count. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008564
Guido van Rossum078151d2002-08-11 04:24:12 +00008565/* XXX To save some code duplication, formatfloat/long/int could have been
8566 shared with stringobject.c, converting from 8-bit to Unicode after the
8567 formatting is done. */
8568
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569static int
8570formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008571 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 int flags,
8573 int prec,
8574 int type,
8575 PyObject *v)
8576{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008577 /* fmt = '%#.' + `prec` + `type`
8578 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 char fmt[20];
8580 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008581
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 x = PyFloat_AsDouble(v);
8583 if (x == -1.0 && PyErr_Occurred())
8584 return -1;
8585 if (prec < 0)
8586 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008587 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8588 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008589 /* Worst case length calc to ensure no buffer overrun:
8590
8591 'g' formats:
8592 fmt = %#.<prec>g
8593 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8594 for any double rep.)
8595 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8596
8597 'f' formats:
8598 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8599 len = 1 + 50 + 1 + prec = 52 + prec
8600
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008601 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008602 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008603
8604 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008605 if (((type == 'g' || type == 'G') &&
8606 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008607 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008608 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008609 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008610 return -1;
8611 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008612 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8613 (flags&F_ALT) ? "#" : "",
8614 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008615 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
Tim Peters38fd5b62000-09-21 05:43:11 +00008618static PyObject*
8619formatlong(PyObject *val, int flags, int prec, int type)
8620{
8621 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008622 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008623 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008624 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008625
Christian Heimes72b710a2008-05-26 13:28:38 +00008626 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008627 if (!str)
8628 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008629 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008630 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008631 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008632}
8633
#if 0
/* Disabled.  Format the integer object v into buf using a C long
   conversion.

   Returns the number of characters written, or -1 with an exception
   set (conversion failure, or OverflowError if buflen is too small for
   the requested precision). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     * + 1 + 1
     * = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *sign;
    int radix_conv;     /* true for the 'x', 'X' and 'o' conversions */
    int negate;         /* true when the sign is emitted by hand */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under 'u' is printed as a signed decimal. */
    if (x < 0 && type == 'u')
        type = 'd';

    radix_conv = (type == 'x' || type == 'X' || type == 'o');
    negate = (x < 0 && radix_conv);
    sign = negate ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && radix_conv) {
        /* Converting under %#o, %#x or %#X is painful:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * Consistency is achieved by inserting our own base prefix
         * and using the plain conversion in place of the '#' form.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    /* When the sign was written by hand, format the magnitude. */
    return longtounicode(buf, buflen, fmt, negate ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
8712static int
8713formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008714 size_t buflen,
8715 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008717 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008718 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008719 if (PyUnicode_GET_SIZE(v) == 1) {
8720 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8721 buf[1] = '\0';
8722 return 1;
8723 }
8724#ifndef Py_UNICODE_WIDE
8725 if (PyUnicode_GET_SIZE(v) == 2) {
8726 /* Decode a valid surrogate pair */
8727 int c0 = PyUnicode_AS_UNICODE(v)[0];
8728 int c1 = PyUnicode_AS_UNICODE(v)[1];
8729 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8730 0xDC00 <= c1 && c1 <= 0xDFFF) {
8731 buf[0] = c0;
8732 buf[1] = c1;
8733 buf[2] = '\0';
8734 return 2;
8735 }
8736 }
8737#endif
8738 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 else {
8741 /* Integer input truncated to a character */
8742 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008743 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008745 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008746
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008747 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008748 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008749 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008750 return -1;
8751 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008752
8753#ifndef Py_UNICODE_WIDE
8754 if (x > 0xffff) {
8755 x -= 0x10000;
8756 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8757 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8758 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008759 }
8760#endif
8761 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008762 buf[1] = '\0';
8763 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008765
8766 onError:
8767 PyErr_SetString(PyExc_TypeError,
8768 "%c requires int or char");
8769 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770}
8771
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8781
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782PyObject *PyUnicode_Format(PyObject *format,
8783 PyObject *args)
8784{
8785 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008786 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 int args_owned = 0;
8788 PyUnicodeObject *result = NULL;
8789 PyObject *dict = NULL;
8790 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008791
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 if (format == NULL || args == NULL) {
8793 PyErr_BadInternalCall();
8794 return NULL;
8795 }
8796 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008797 if (uformat == NULL)
8798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 fmt = PyUnicode_AS_UNICODE(uformat);
8800 fmtcnt = PyUnicode_GET_SIZE(uformat);
8801
8802 reslen = rescnt = fmtcnt + 100;
8803 result = _PyUnicode_New(reslen);
8804 if (result == NULL)
8805 goto onError;
8806 res = PyUnicode_AS_UNICODE(result);
8807
8808 if (PyTuple_Check(args)) {
8809 arglen = PyTuple_Size(args);
8810 argidx = 0;
8811 }
8812 else {
8813 arglen = -1;
8814 argidx = -2;
8815 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008816 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008817 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 dict = args;
8819
8820 while (--fmtcnt >= 0) {
8821 if (*fmt != '%') {
8822 if (--rescnt < 0) {
8823 rescnt = fmtcnt + 100;
8824 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008825 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008826 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8828 --rescnt;
8829 }
8830 *res++ = *fmt++;
8831 }
8832 else {
8833 /* Got a format specifier */
8834 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008835 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837 Py_UNICODE c = '\0';
8838 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008839 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 PyObject *v = NULL;
8841 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008842 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008845 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846
8847 fmt++;
8848 if (*fmt == '(') {
8849 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 PyObject *key;
8852 int pcount = 1;
8853
8854 if (dict == NULL) {
8855 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008856 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 goto onError;
8858 }
8859 ++fmt;
8860 --fmtcnt;
8861 keystart = fmt;
8862 /* Skip over balanced parentheses */
8863 while (pcount > 0 && --fmtcnt >= 0) {
8864 if (*fmt == ')')
8865 --pcount;
8866 else if (*fmt == '(')
8867 ++pcount;
8868 fmt++;
8869 }
8870 keylen = fmt - keystart - 1;
8871 if (fmtcnt < 0 || pcount > 0) {
8872 PyErr_SetString(PyExc_ValueError,
8873 "incomplete format key");
8874 goto onError;
8875 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008876#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008877 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 then looked up since Python uses strings to hold
8879 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008880 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 key = PyUnicode_EncodeUTF8(keystart,
8882 keylen,
8883 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008884#else
8885 key = PyUnicode_FromUnicode(keystart, keylen);
8886#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 if (key == NULL)
8888 goto onError;
8889 if (args_owned) {
8890 Py_DECREF(args);
8891 args_owned = 0;
8892 }
8893 args = PyObject_GetItem(dict, key);
8894 Py_DECREF(key);
8895 if (args == NULL) {
8896 goto onError;
8897 }
8898 args_owned = 1;
8899 arglen = -1;
8900 argidx = -2;
8901 }
8902 while (--fmtcnt >= 0) {
8903 switch (c = *fmt++) {
8904 case '-': flags |= F_LJUST; continue;
8905 case '+': flags |= F_SIGN; continue;
8906 case ' ': flags |= F_BLANK; continue;
8907 case '#': flags |= F_ALT; continue;
8908 case '0': flags |= F_ZERO; continue;
8909 }
8910 break;
8911 }
8912 if (c == '*') {
8913 v = getnextarg(args, arglen, &argidx);
8914 if (v == NULL)
8915 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008916 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 PyErr_SetString(PyExc_TypeError,
8918 "* wants int");
8919 goto onError;
8920 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008921 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008922 if (width == -1 && PyErr_Occurred())
8923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 if (width < 0) {
8925 flags |= F_LJUST;
8926 width = -width;
8927 }
8928 if (--fmtcnt >= 0)
8929 c = *fmt++;
8930 }
8931 else if (c >= '0' && c <= '9') {
8932 width = c - '0';
8933 while (--fmtcnt >= 0) {
8934 c = *fmt++;
8935 if (c < '0' || c > '9')
8936 break;
8937 if ((width*10) / 10 != width) {
8938 PyErr_SetString(PyExc_ValueError,
8939 "width too big");
8940 goto onError;
8941 }
8942 width = width*10 + (c - '0');
8943 }
8944 }
8945 if (c == '.') {
8946 prec = 0;
8947 if (--fmtcnt >= 0)
8948 c = *fmt++;
8949 if (c == '*') {
8950 v = getnextarg(args, arglen, &argidx);
8951 if (v == NULL)
8952 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008953 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 PyErr_SetString(PyExc_TypeError,
8955 "* wants int");
8956 goto onError;
8957 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008959 if (prec == -1 && PyErr_Occurred())
8960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 if (prec < 0)
8962 prec = 0;
8963 if (--fmtcnt >= 0)
8964 c = *fmt++;
8965 }
8966 else if (c >= '0' && c <= '9') {
8967 prec = c - '0';
8968 while (--fmtcnt >= 0) {
8969 c = Py_CHARMASK(*fmt++);
8970 if (c < '0' || c > '9')
8971 break;
8972 if ((prec*10) / 10 != prec) {
8973 PyErr_SetString(PyExc_ValueError,
8974 "prec too big");
8975 goto onError;
8976 }
8977 prec = prec*10 + (c - '0');
8978 }
8979 }
8980 } /* prec */
8981 if (fmtcnt >= 0) {
8982 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 if (--fmtcnt >= 0)
8984 c = *fmt++;
8985 }
8986 }
8987 if (fmtcnt < 0) {
8988 PyErr_SetString(PyExc_ValueError,
8989 "incomplete format");
8990 goto onError;
8991 }
8992 if (c != '%') {
8993 v = getnextarg(args, arglen, &argidx);
8994 if (v == NULL)
8995 goto onError;
8996 }
8997 sign = 0;
8998 fill = ' ';
8999 switch (c) {
9000
9001 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009002 pbuf = formatbuf;
9003 /* presume that buffer length is at least 1 */
9004 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 len = 1;
9006 break;
9007
9008 case 's':
9009 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009010 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 if (PyUnicode_Check(v) && c == 's') {
9012 temp = v;
9013 Py_INCREF(temp);
9014 }
9015 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009017 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009018 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009020 else
9021 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (temp == NULL)
9023 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009024 if (PyUnicode_Check(temp))
9025 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009026 else {
9027 Py_DECREF(temp);
9028 PyErr_SetString(PyExc_TypeError,
9029 "%s argument has non-string str()");
9030 goto onError;
9031 }
9032 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009033 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 len = PyUnicode_GET_SIZE(temp);
9035 if (prec >= 0 && len > prec)
9036 len = prec;
9037 break;
9038
9039 case 'i':
9040 case 'd':
9041 case 'u':
9042 case 'o':
9043 case 'x':
9044 case 'X':
9045 if (c == 'i')
9046 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009047 isnumok = 0;
9048 if (PyNumber_Check(v)) {
9049 PyObject *iobj=NULL;
9050
9051 if (PyLong_Check(v)) {
9052 iobj = v;
9053 Py_INCREF(iobj);
9054 }
9055 else {
9056 iobj = PyNumber_Long(v);
9057 }
9058 if (iobj!=NULL) {
9059 if (PyLong_Check(iobj)) {
9060 isnumok = 1;
9061 temp = formatlong(iobj, flags, prec, c);
9062 Py_DECREF(iobj);
9063 if (!temp)
9064 goto onError;
9065 pbuf = PyUnicode_AS_UNICODE(temp);
9066 len = PyUnicode_GET_SIZE(temp);
9067 sign = 1;
9068 }
9069 else {
9070 Py_DECREF(iobj);
9071 }
9072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009074 if (!isnumok) {
9075 PyErr_Format(PyExc_TypeError,
9076 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009077 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009078 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009079 }
9080 if (flags & F_ZERO)
9081 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 break;
9083
9084 case 'e':
9085 case 'E':
9086 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009087 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 case 'g':
9089 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009090 if (c == 'F')
9091 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009092 pbuf = formatbuf;
9093 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9094 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 if (len < 0)
9096 goto onError;
9097 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009098 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 fill = '0';
9100 break;
9101
9102 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009103 pbuf = formatbuf;
9104 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 if (len < 0)
9106 goto onError;
9107 break;
9108
9109 default:
9110 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009111 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009112 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009113 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009114 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009115 (Py_ssize_t)(fmt - 1 -
9116 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 goto onError;
9118 }
9119 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009120 if (*pbuf == '-' || *pbuf == '+') {
9121 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 len--;
9123 }
9124 else if (flags & F_SIGN)
9125 sign = '+';
9126 else if (flags & F_BLANK)
9127 sign = ' ';
9128 else
9129 sign = 0;
9130 }
9131 if (width < len)
9132 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009133 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 reslen -= rescnt;
9135 rescnt = width + fmtcnt + 100;
9136 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009137 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009138 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009139 PyErr_NoMemory();
9140 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009141 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009142 if (_PyUnicode_Resize(&result, reslen) < 0) {
9143 Py_XDECREF(temp);
9144 goto onError;
9145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 res = PyUnicode_AS_UNICODE(result)
9147 + reslen - rescnt;
9148 }
9149 if (sign) {
9150 if (fill != ' ')
9151 *res++ = sign;
9152 rescnt--;
9153 if (width > len)
9154 width--;
9155 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009156 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009157 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009158 assert(pbuf[1] == c);
9159 if (fill != ' ') {
9160 *res++ = *pbuf++;
9161 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009162 }
Tim Petersfff53252001-04-12 18:38:48 +00009163 rescnt -= 2;
9164 width -= 2;
9165 if (width < 0)
9166 width = 0;
9167 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 if (width > len && !(flags & F_LJUST)) {
9170 do {
9171 --rescnt;
9172 *res++ = fill;
9173 } while (--width > len);
9174 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009175 if (fill == ' ') {
9176 if (sign)
9177 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009178 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009179 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009180 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009181 *res++ = *pbuf++;
9182 *res++ = *pbuf++;
9183 }
9184 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009185 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186 res += len;
9187 rescnt -= len;
9188 while (--width >= len) {
9189 --rescnt;
9190 *res++ = ' ';
9191 }
9192 if (dict && (argidx < arglen) && c != '%') {
9193 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009194 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009195 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 goto onError;
9197 }
9198 Py_XDECREF(temp);
9199 } /* '%' */
9200 } /* until end */
9201 if (argidx < arglen && !dict) {
9202 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009203 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204 goto onError;
9205 }
9206
Thomas Woutersa96affe2006-03-12 00:29:36 +00009207 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9208 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209 if (args_owned) {
9210 Py_DECREF(args);
9211 }
9212 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 return (PyObject *)result;
9214
9215 onError:
9216 Py_XDECREF(result);
9217 Py_DECREF(uformat);
9218 if (args_owned) {
9219 Py_DECREF(args);
9220 }
9221 return NULL;
9222}
9223
Jeremy Hylton938ace62002-07-17 16:30:39 +00009224static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009225unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9226
Tim Peters6d6c1a32001-08-02 04:15:00 +00009227static PyObject *
9228unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9229{
9230 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009231 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009232 char *encoding = NULL;
9233 char *errors = NULL;
9234
Guido van Rossume023fe02001-08-30 03:12:59 +00009235 if (type != &PyUnicode_Type)
9236 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009237 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009238 kwlist, &x, &encoding, &errors))
9239 return NULL;
9240 if (x == NULL)
9241 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009242 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009243 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009244 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009245 return PyUnicode_FromEncodedObject(x, encoding, errors);
9246}
9247
Guido van Rossume023fe02001-08-30 03:12:59 +00009248static PyObject *
9249unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9250{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009251 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009252 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009253
9254 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9255 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9256 if (tmp == NULL)
9257 return NULL;
9258 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009259 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009260 if (pnew == NULL) {
9261 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009262 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009263 }
Christian Heimesb186d002008-03-18 15:15:01 +00009264 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009265 if (pnew->str == NULL) {
9266 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009267 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009268 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009269 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009270 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009271 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9272 pnew->length = n;
9273 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009274 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009275 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009276}
9277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009278PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009279"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009280\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009281Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009282encoding defaults to the current default string encoding.\n\
9283errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009284
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009285static PyObject *unicode_iter(PyObject *seq);
9286
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009288 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009289 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 sizeof(PyUnicodeObject), /* tp_size */
9291 0, /* tp_itemsize */
9292 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009293 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009295 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009297 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009298 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009299 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009301 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 (hashfunc) unicode_hash, /* tp_hash*/
9303 0, /* tp_call*/
9304 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009305 PyObject_GenericGetAttr, /* tp_getattro */
9306 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009307 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009308 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9309 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009310 unicode_doc, /* tp_doc */
9311 0, /* tp_traverse */
9312 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009313 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009314 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009315 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009316 0, /* tp_iternext */
9317 unicode_methods, /* tp_methods */
9318 0, /* tp_members */
9319 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009320 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009321 0, /* tp_dict */
9322 0, /* tp_descr_get */
9323 0, /* tp_descr_set */
9324 0, /* tp_dictoffset */
9325 0, /* tp_init */
9326 0, /* tp_alloc */
9327 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009328 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329};
9330
9331/* Initialize the Unicode implementation */
9332
Thomas Wouters78890102000-07-22 19:25:51 +00009333void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009335 int i;
9336
Thomas Wouters477c8d52006-05-27 19:21:47 +00009337 /* XXX - move this array to unicodectype.c ? */
9338 Py_UNICODE linebreak[] = {
9339 0x000A, /* LINE FEED */
9340 0x000D, /* CARRIAGE RETURN */
9341 0x001C, /* FILE SEPARATOR */
9342 0x001D, /* GROUP SEPARATOR */
9343 0x001E, /* RECORD SEPARATOR */
9344 0x0085, /* NEXT LINE */
9345 0x2028, /* LINE SEPARATOR */
9346 0x2029, /* PARAGRAPH SEPARATOR */
9347 };
9348
Fred Drakee4315f52000-05-09 19:53:39 +00009349 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009350 free_list = NULL;
9351 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009353 if (!unicode_empty)
9354 return;
9355
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009356 for (i = 0; i < 256; i++)
9357 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009358 if (PyType_Ready(&PyUnicode_Type) < 0)
9359 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009360
9361 /* initialize the linebreak bloom filter */
9362 bloom_linebreak = make_bloom_mask(
9363 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9364 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009365
9366 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367}
9368
9369/* Finalize the Unicode implementation */
9370
Christian Heimesa156e092008-02-16 07:38:31 +00009371int
9372PyUnicode_ClearFreeList(void)
9373{
9374 int freelist_size = numfree;
9375 PyUnicodeObject *u;
9376
9377 for (u = free_list; u != NULL;) {
9378 PyUnicodeObject *v = u;
9379 u = *(PyUnicodeObject **)u;
9380 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009381 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009382 Py_XDECREF(v->defenc);
9383 PyObject_Del(v);
9384 numfree--;
9385 }
9386 free_list = NULL;
9387 assert(numfree == 0);
9388 return freelist_size;
9389}
9390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391void
Thomas Wouters78890102000-07-22 19:25:51 +00009392_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009394 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009396 Py_XDECREF(unicode_empty);
9397 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009398
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009399 for (i = 0; i < 256; i++) {
9400 if (unicode_latin1[i]) {
9401 Py_DECREF(unicode_latin1[i]);
9402 unicode_latin1[i] = NULL;
9403 }
9404 }
Christian Heimesa156e092008-02-16 07:38:31 +00009405 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009407
Walter Dörwald16807132007-05-25 13:52:07 +00009408void
9409PyUnicode_InternInPlace(PyObject **p)
9410{
9411 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9412 PyObject *t;
9413 if (s == NULL || !PyUnicode_Check(s))
9414 Py_FatalError(
9415 "PyUnicode_InternInPlace: unicode strings only please!");
9416 /* If it's a subclass, we don't really know what putting
9417 it in the interned dict might do. */
9418 if (!PyUnicode_CheckExact(s))
9419 return;
9420 if (PyUnicode_CHECK_INTERNED(s))
9421 return;
9422 if (interned == NULL) {
9423 interned = PyDict_New();
9424 if (interned == NULL) {
9425 PyErr_Clear(); /* Don't leave an exception */
9426 return;
9427 }
9428 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009429 /* It might be that the GetItem call fails even
9430 though the key is present in the dictionary,
9431 namely when this happens during a stack overflow. */
9432 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009433 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009434 Py_END_ALLOW_RECURSION
9435
Walter Dörwald16807132007-05-25 13:52:07 +00009436 if (t) {
9437 Py_INCREF(t);
9438 Py_DECREF(*p);
9439 *p = t;
9440 return;
9441 }
9442
Martin v. Löwis5b222132007-06-10 09:51:05 +00009443 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009444 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9445 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009446 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009447 return;
9448 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009449 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009450 /* The two references in interned are not counted by refcnt.
9451 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009452 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009453 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9454}
9455
9456void
9457PyUnicode_InternImmortal(PyObject **p)
9458{
9459 PyUnicode_InternInPlace(p);
9460 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9461 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9462 Py_INCREF(*p);
9463 }
9464}
9465
9466PyObject *
9467PyUnicode_InternFromString(const char *cp)
9468{
9469 PyObject *s = PyUnicode_FromString(cp);
9470 if (s == NULL)
9471 return NULL;
9472 PyUnicode_InternInPlace(&s);
9473 return s;
9474}
9475
9476void _Py_ReleaseInternedUnicodeStrings(void)
9477{
9478 PyObject *keys;
9479 PyUnicodeObject *s;
9480 Py_ssize_t i, n;
9481 Py_ssize_t immortal_size = 0, mortal_size = 0;
9482
9483 if (interned == NULL || !PyDict_Check(interned))
9484 return;
9485 keys = PyDict_Keys(interned);
9486 if (keys == NULL || !PyList_Check(keys)) {
9487 PyErr_Clear();
9488 return;
9489 }
9490
9491 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9492 detector, interned unicode strings are not forcibly deallocated;
9493 rather, we give them their stolen references back, and then clear
9494 and DECREF the interned dict. */
9495
9496 n = PyList_GET_SIZE(keys);
9497 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9498 n);
9499 for (i = 0; i < n; i++) {
9500 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9501 switch (s->state) {
9502 case SSTATE_NOT_INTERNED:
9503 /* XXX Shouldn't happen */
9504 break;
9505 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009506 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009507 immortal_size += s->length;
9508 break;
9509 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009510 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009511 mortal_size += s->length;
9512 break;
9513 default:
9514 Py_FatalError("Inconsistent interned string state.");
9515 }
9516 s->state = SSTATE_NOT_INTERNED;
9517 }
9518 fprintf(stderr, "total size of all interned strings: "
9519 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9520 "mortal/immortal\n", mortal_size, immortal_size);
9521 Py_DECREF(keys);
9522 PyDict_Clear(interned);
9523 Py_DECREF(interned);
9524 interned = NULL;
9525}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009526
9527
9528/********************* Unicode Iterator **************************/
9529
9530typedef struct {
9531 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009532 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009533 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9534} unicodeiterobject;
9535
9536static void
9537unicodeiter_dealloc(unicodeiterobject *it)
9538{
9539 _PyObject_GC_UNTRACK(it);
9540 Py_XDECREF(it->it_seq);
9541 PyObject_GC_Del(it);
9542}
9543
9544static int
9545unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9546{
9547 Py_VISIT(it->it_seq);
9548 return 0;
9549}
9550
9551static PyObject *
9552unicodeiter_next(unicodeiterobject *it)
9553{
9554 PyUnicodeObject *seq;
9555 PyObject *item;
9556
9557 assert(it != NULL);
9558 seq = it->it_seq;
9559 if (seq == NULL)
9560 return NULL;
9561 assert(PyUnicode_Check(seq));
9562
9563 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009564 item = PyUnicode_FromUnicode(
9565 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009566 if (item != NULL)
9567 ++it->it_index;
9568 return item;
9569 }
9570
9571 Py_DECREF(seq);
9572 it->it_seq = NULL;
9573 return NULL;
9574}
9575
9576static PyObject *
9577unicodeiter_len(unicodeiterobject *it)
9578{
9579 Py_ssize_t len = 0;
9580 if (it->it_seq)
9581 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009582 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009583}
9584
9585PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9586
9587static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009588 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9589 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009590 {NULL, NULL} /* sentinel */
9591};
9592
9593PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009594 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009595 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009596 sizeof(unicodeiterobject), /* tp_basicsize */
9597 0, /* tp_itemsize */
9598 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009599 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009600 0, /* tp_print */
9601 0, /* tp_getattr */
9602 0, /* tp_setattr */
9603 0, /* tp_compare */
9604 0, /* tp_repr */
9605 0, /* tp_as_number */
9606 0, /* tp_as_sequence */
9607 0, /* tp_as_mapping */
9608 0, /* tp_hash */
9609 0, /* tp_call */
9610 0, /* tp_str */
9611 PyObject_GenericGetAttr, /* tp_getattro */
9612 0, /* tp_setattro */
9613 0, /* tp_as_buffer */
9614 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9615 0, /* tp_doc */
9616 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9617 0, /* tp_clear */
9618 0, /* tp_richcompare */
9619 0, /* tp_weaklistoffset */
9620 PyObject_SelfIter, /* tp_iter */
9621 (iternextfunc)unicodeiter_next, /* tp_iternext */
9622 unicodeiter_methods, /* tp_methods */
9623 0,
9624};
9625
9626static PyObject *
9627unicode_iter(PyObject *seq)
9628{
9629 unicodeiterobject *it;
9630
9631 if (!PyUnicode_Check(seq)) {
9632 PyErr_BadInternalCall();
9633 return NULL;
9634 }
9635 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9636 if (it == NULL)
9637 return NULL;
9638 it->it_index = 0;
9639 Py_INCREF(seq);
9640 it->it_seq = (PyUnicodeObject *)seq;
9641 _PyObject_GC_TRACK(it);
9642 return (PyObject *)it;
9643}
9644
Martin v. Löwis5b222132007-06-10 09:51:05 +00009645size_t
9646Py_UNICODE_strlen(const Py_UNICODE *u)
9647{
9648 int res = 0;
9649 while(*u++)
9650 res++;
9651 return res;
9652}
9653
9654Py_UNICODE*
9655Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9656{
9657 Py_UNICODE *u = s1;
9658 while ((*u++ = *s2++));
9659 return s1;
9660}
9661
9662Py_UNICODE*
9663Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9664{
9665 Py_UNICODE *u = s1;
9666 while ((*u++ = *s2++))
9667 if (n-- == 0)
9668 break;
9669 return s1;
9670}
9671
9672int
9673Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9674{
9675 while (*s1 && *s2 && *s1 == *s2)
9676 s1++, s2++;
9677 if (*s1 && *s2)
9678 return (*s1 < *s2) ? -1 : +1;
9679 if (*s1)
9680 return 1;
9681 if (*s2)
9682 return -1;
9683 return 0;
9684}
9685
9686Py_UNICODE*
9687Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9688{
9689 const Py_UNICODE *p;
9690 for (p = s; *p; p++)
9691 if (*p == c)
9692 return (Py_UNICODE*)p;
9693 return NULL;
9694}
9695
9696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009697#ifdef __cplusplus
9698}
9699#endif
9700
9701
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009702/*
9703Local variables:
9704c-basic-offset: 4
9705indent-tabs-mode: nil
9706End:
9707*/