blob: 8316e91bdb448a83037621604a110f452f45a048 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0x00..0x7F): an entry is 1 when
   the character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0x00..0x7F): an entry is 1 when
   the character is treated as a line break and 0 otherwise.  The table is
   never written to, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
204#define BLOOM_MASK unsigned long
205
206static BLOOM_MASK bloom_linebreak;
207
/* Test whether the bit selected by the low 5 bits of `ch` is set in `mask`.
   Evaluates to a non-zero unsigned long when the bit is set, 0 otherwise.
   The shifted constant is unsigned long so that bit 31 can be selected
   without shifting into the sign bit of an int, and `mask` is parenthesized
   so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))
209
Christian Heimes190d79e2008-01-30 11:58:22 +0000210#define BLOOM_LINEBREAK(ch) \
211 ((ch) < 128U ? ascii_linebreak[(ch)] : \
212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True when `chr` is a member of the `setlen`-element array `set`.  The
   bloom `mask` is checked first so that the linear scan in
   unicode_member() only runs when the bloom bit for `chr` is set.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
428 PyErr_BadInternalCall();
429 return -1;
430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 PyErr_BadInternalCall();
434 return -1;
435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000447 Py_DECREF(*unicode);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000448 *unicode = w;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 return 0;
450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
475 }
476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 if (!unicode)
484 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000485 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
510 "Negative size passed to PyUnicode_FromStringAndSize");
511 return NULL;
512 }
513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
524 }
525
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000529 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000534 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566{
567 PyUnicodeObject *unicode;
568
569 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000570 if (size == 0)
571 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572 PyErr_BadInternalCall();
573 return NULL;
574 }
575
Martin v. Löwis790465f2008-04-05 20:41:37 +0000576 if (size == -1) {
577 size = wcslen(w);
578 }
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 unicode = _PyUnicode_New(size);
581 if (!unicode)
582 return NULL;
583
584 /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000587#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 {
589 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000590 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000592 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 *u++ = *w++;
594 }
595#endif
596
597 return (PyObject *)unicode;
598}
599
Walter Dörwald346737f2007-05-31 10:44:43 +0000600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603 *fmt++ = '%';
604 if (width) {
605 if (zeropad)
606 *fmt++ = '0';
607 fmt += sprintf(fmt, "%d", width);
608 }
609 if (precision)
610 fmt += sprintf(fmt, ".%d", precision);
611 if (longflag)
612 *fmt++ = 'l';
613 else if (size_tflag) {
614 char *f = PY_FORMAT_SIZE_T;
615 while (*f)
616 *fmt++ = *f++;
617 }
618 *fmt++ = c;
619 *fmt = '\0';
620}
621
/* Copy the NUL-terminated string `string` element by element to the output
   cursor `s`, advancing `s` past the copied data (no terminator is
   written).  Relies on `s` and `copy` being declared at the point of use.
   The argument is parenthesized so that any expression can be passed. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000628 Py_ssize_t callcount = 0;
629 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000630 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000631 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000632 int width = 0;
633 int precision = 0;
634 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000635 const char* f;
636 Py_UNICODE *s;
637 PyObject *string;
638 /* used by sprintf */
639 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000640 /* use abuffer instead of buffer, if we need more space
641 * (which can happen if there's a format specifier with width). */
642 char *abuffer = NULL;
643 char *realbuffer;
644 Py_ssize_t abuffersize = 0;
645 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646 const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649 Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef __va_copy
652 __va_copy(count, vargs);
653#else
654 count = vargs;
655#endif
656#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000657 /* step 1: count the number of %S/%R/%A format specifications
658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659 * these objects once during step 3 and put the result in
660 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000663 ++callcount;
664 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000665 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000666 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000669 if (!callresults) {
670 PyErr_NoMemory();
671 return NULL;
672 }
673 callresult = callresults;
674 }
675 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 for (f = format; *f; f++) {
677 if (*f == '%') {
678 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000681 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683 ;
684
685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686 * they don't affect the amount of space we reserve.
687 */
688 if ((*f == 'l' || *f == 'z') &&
689 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000690 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691
692 switch (*f) {
693 case 'c':
694 (void)va_arg(count, int);
695 /* fall through... */
696 case '%':
697 n++;
698 break;
699 case 'd': case 'u': case 'i': case 'x':
700 (void) va_arg(count, int);
701 /* 20 bytes is enough to hold a 64-bit
702 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000703 This isn't enough for octal.
704 If a width is specified we need more
705 (which we allocate later). */
706 if (width < 20)
707 width = 20;
708 n += width;
709 if (abuffersize < width)
710 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711 break;
712 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000713 {
714 /* UTF-8 */
715 unsigned char*s;
716 s = va_arg(count, unsigned char*);
717 while (*s) {
718 if (*s < 128) {
719 n++; s++;
720 } else if (*s < 0xc0) {
721 /* invalid UTF-8 */
722 n++; s++;
723 } else if (*s < 0xc0) {
724 n++;
725 s++; if(!*s)break;
726 s++;
727 } else if (*s < 0xe0) {
728 n++;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 } else {
733 #ifdef Py_UNICODE_WIDE
734 n++;
735 #else
736 n+=2;
737 #endif
738 s++; if(!*s)break;
739 s++; if(!*s)break;
740 s++; if(!*s)break;
741 s++;
742 }
743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000745 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 case 'U':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 assert(obj && PyUnicode_Check(obj));
750 n += PyUnicode_GET_SIZE(obj);
751 break;
752 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000753 case 'V':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 const char *str = va_arg(count, const char *);
757 assert(obj || str);
758 assert(!obj || PyUnicode_Check(obj));
759 if (obj)
760 n += PyUnicode_GET_SIZE(obj);
761 else
762 n += strlen(str);
763 break;
764 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000765 case 'S':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 PyObject *str;
769 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000770 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000771 if (!str)
772 goto fail;
773 n += PyUnicode_GET_SIZE(str);
774 /* Remember the str and switch to the next slot */
775 *callresult++ = str;
776 break;
777 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000778 case 'R':
779 {
780 PyObject *obj = va_arg(count, PyObject *);
781 PyObject *repr;
782 assert(obj);
783 repr = PyObject_Repr(obj);
784 if (!repr)
785 goto fail;
786 n += PyUnicode_GET_SIZE(repr);
787 /* Remember the repr and switch to the next slot */
788 *callresult++ = repr;
789 break;
790 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000791 case 'A':
792 {
793 PyObject *obj = va_arg(count, PyObject *);
794 PyObject *ascii;
795 assert(obj);
796 ascii = PyObject_ASCII(obj);
797 if (!ascii)
798 goto fail;
799 n += PyUnicode_GET_SIZE(ascii);
800 /* Remember the repr and switch to the next slot */
801 *callresult++ = ascii;
802 break;
803 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 case 'p':
805 (void) va_arg(count, int);
806 /* maximum 64-bit pointer representation:
807 * 0xffffffffffffffff
808 * so 19 characters is enough.
809 * XXX I count 18 -- what's the extra for?
810 */
811 n += 19;
812 break;
813 default:
814 /* if we stumble upon an unknown
815 formatting code, copy the rest of
816 the format string to the output
817 string. (we cannot just skip the
818 code, since there's no way to know
819 what's in the argument list) */
820 n += strlen(p);
821 goto expand;
822 }
823 } else
824 n++;
825 }
826 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000828 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000829 if (!abuffer) {
830 PyErr_NoMemory();
831 goto fail;
832 }
833 realbuffer = abuffer;
834 }
835 else
836 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 we don't have to resize the string.
840 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 string = PyUnicode_FromUnicode(NULL, n);
842 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844
845 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000846 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847
848 for (f = format; *f; f++) {
849 if (*f == '%') {
850 const char* p = f++;
851 int longflag = 0;
852 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 zeropad = (*f == '0');
854 /* parse the width.precision part */
855 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000856 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 width = (width*10) + *f++ - '0';
858 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (*f == '.') {
860 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000861 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 /* handle the long flag, but only for %ld and %lu.
865 others can be added when necessary. */
866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867 longflag = 1;
868 ++f;
869 }
870 /* handle the size_t flag. */
871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872 size_tflag = 1;
873 ++f;
874 }
875
876 switch (*f) {
877 case 'c':
878 *s++ = va_arg(vargs, int);
879 break;
880 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000887 sprintf(realbuffer, fmt, va_arg(vargs, int));
888 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 break;
890 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
905 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 break;
910 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000911 {
912 /* Parameter must be UTF-8 encoded.
913 In case of encoding errors, use
914 the replacement character. */
915 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000917 u = PyUnicode_DecodeUTF8(p, strlen(p),
918 "replace");
919 if (!u)
920 goto fail;
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922 PyUnicode_GET_SIZE(u));
923 s += PyUnicode_GET_SIZE(u);
924 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000926 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000927 case 'U':
928 {
929 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000933 break;
934 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000935 case 'V':
936 {
937 PyObject *obj = va_arg(vargs, PyObject *);
938 const char *str = va_arg(vargs, const char *);
939 if (obj) {
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 } else {
944 appendstring(str);
945 }
946 break;
947 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000948 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000949 case 'R':
950 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000951 Py_UNICODE *ucopy;
952 Py_ssize_t usize;
953 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 /* unused, since we already have the result */
955 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000956 ucopy = PyUnicode_AS_UNICODE(*callresult);
957 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 for (upos = 0; upos<usize;)
959 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000962 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000963 ++callresult;
964 break;
965 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966 case 'p':
967 sprintf(buffer, "%p", va_arg(vargs, void*));
968 /* %p is ill-defined: ensure leading 0x. */
969 if (buffer[1] == 'X')
970 buffer[1] = 'x';
971 else if (buffer[1] != 'x') {
972 memmove(buffer+2, buffer, strlen(buffer)+1);
973 buffer[0] = '0';
974 buffer[1] = 'x';
975 }
976 appendstring(buffer);
977 break;
978 case '%':
979 *s++ = '%';
980 break;
981 default:
982 appendstring(p);
983 goto end;
984 }
985 } else
986 *s++ = *f;
987 }
988
989 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000990 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000992 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000993 PyObject_Free(abuffer);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000996 fail:
997 if (callresults) {
998 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000999 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 Py_DECREF(*callresult2);
1001 ++callresult2;
1002 }
Christian Heimesb186d002008-03-18 15:15:01 +00001003 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001004 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001005 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001006 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001007 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015 PyObject* ret;
1016 va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019 va_start(vargs, format);
1020#else
1021 va_start(vargs);
1022#endif
1023 ret = PyUnicode_FromFormatV(format, vargs);
1024 va_end(vargs);
1025 return ret;
1026}
1027
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029 wchar_t *w,
1030 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031{
1032 if (unicode == NULL) {
1033 PyErr_BadInternalCall();
1034 return -1;
1035 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001036
1037 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039 size = PyUnicode_GET_SIZE(unicode) + 1;
1040
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041#ifdef HAVE_USABLE_WCHAR_T
1042 memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044 {
1045 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001046 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001048 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 *w++ = *u++;
1050 }
1051#endif
1052
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001053 if (size > PyUnicode_GET_SIZE(unicode))
1054 return PyUnicode_GET_SIZE(unicode);
1055 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 return size;
1057}
1058
1059#endif
1060
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065 if (ordinal < 0 || ordinal > 0x10ffff) {
1066 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001067 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 return NULL;
1069 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001070
1071#ifndef Py_UNICODE_WIDE
1072 if (ordinal > 0xffff) {
1073 ordinal -= 0x10000;
1074 s[0] = 0xD800 | (ordinal >> 10);
1075 s[1] = 0xDC00 | (ordinal & 0x3FF);
1076 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#endif
1079
Hye-Shik Chang40574832004-04-06 07:24:51 +00001080 s[0] = (Py_UNICODE)ordinal;
1081 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001082}
1083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 if (PyUnicode_CheckExact(obj)) {
1089 Py_INCREF(obj);
1090 return obj;
1091 }
1092 if (PyUnicode_Check(obj)) {
1093 /* For a Unicode subtype that's not a Unicode object,
1094 return a true Unicode object with the same data. */
1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096 PyUnicode_GET_SIZE(obj));
1097 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001098 PyErr_Format(PyExc_TypeError,
1099 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001100 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001101 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105 const char *encoding,
1106 const char *errors)
1107{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (obj == NULL) {
1113 PyErr_BadInternalCall();
1114 return NULL;
1115 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001117 if (PyUnicode_Check(obj)) {
1118 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001119 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122
1123 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001124 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001125 s = PyBytes_AS_STRING(obj);
1126 len = PyBytes_GET_SIZE(obj);
1127 }
1128 else if (PyByteArray_Check(obj)) {
1129 s = PyByteArray_AS_STRING(obj);
1130 len = PyByteArray_GET_SIZE(obj);
1131 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001132 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133 /* Overwrite the error message with something more useful in
1134 case of a TypeError. */
1135 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001137 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001138 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001139 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001140 goto onError;
1141 }
Tim Petersced69f82003-09-16 20:30:58 +00001142
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001143 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 if (len == 0) {
1145 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001146 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 }
Tim Petersced69f82003-09-16 20:30:58 +00001148 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001150
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001151 return v;
1152
1153 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 const char *encoding,
1160 const char *errors)
1161{
1162 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001163 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 char lower[20]; /* Enough for any encoding name we recognize */
1165 char *l;
1166 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
1168 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 encoding = PyUnicode_GetDefaultEncoding();
1170
1171 /* Convert encoding to lower case and replace '_' with '-' in order to
1172 catch e.g. UTF_8 */
1173 e = encoding;
1174 l = lower;
1175 while (*e && l < &lower[(sizeof lower) - 2]) {
1176 if (ISUPPER(*e)) {
1177 *l++ = TOLOWER(*e++);
1178 }
1179 else if (*e == '_') {
1180 *l++ = '-';
1181 e++;
1182 }
1183 else {
1184 *l++ = *e++;
1185 }
1186 }
1187 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001188
1189 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if ((strcmp(lower, "latin-1") == 0) ||
1193 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001196 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001201 else if (strcmp(lower, "utf-16") == 0)
1202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203 else if (strcmp(lower, "utf-32") == 0)
1204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (buffer == NULL)
1212 goto onError;
1213 unicode = PyCodec_Decode(buffer, encoding, errors);
1214 if (unicode == NULL)
1215 goto onError;
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_DECREF(unicode);
1221 goto onError;
1222 }
1223 Py_DECREF(buffer);
1224 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 onError:
1227 Py_XDECREF(buffer);
1228 return NULL;
1229}
1230
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v;
1236
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Decode via the codec registry */
1246 v = PyCodec_Decode(unicode, encoding, errors);
1247 if (v == NULL)
1248 goto onError;
1249 return v;
1250
1251 onError:
1252 return NULL;
1253}
1254
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256 const char *encoding,
1257 const char *errors)
1258{
1259 PyObject *v;
1260
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 goto onError;
1264 }
1265
1266 if (encoding == NULL)
1267 encoding = PyUnicode_GetDefaultEncoding();
1268
1269 /* Decode via the codec registry */
1270 v = PyCodec_Decode(unicode, encoding, errors);
1271 if (v == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(v)) {
1274 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001275 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001276 Py_TYPE(v)->tp_name);
1277 Py_DECREF(v);
1278 goto onError;
1279 }
1280 return v;
1281
1282 onError:
1283 return NULL;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 unicode = PyUnicode_FromUnicode(s, size);
1294 if (unicode == NULL)
1295 return NULL;
1296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297 Py_DECREF(unicode);
1298 return v;
1299}
1300
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302 const char *encoding,
1303 const char *errors)
1304{
1305 PyObject *v;
1306
1307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadArgument();
1309 goto onError;
1310 }
1311
1312 if (encoding == NULL)
1313 encoding = PyUnicode_GetDefaultEncoding();
1314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 return v;
1320
1321 onError:
1322 return NULL;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326 const char *encoding,
1327 const char *errors)
1328{
1329 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (!PyUnicode_Check(unicode)) {
1332 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 }
Fred Drakee4315f52000-05-09 19:53:39 +00001335
Tim Petersced69f82003-09-16 20:30:58 +00001336 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001337 encoding = PyUnicode_GetDefaultEncoding();
1338
1339 /* Shortcuts for common default encodings */
1340 if (errors == NULL) {
1341 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001342 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001343 else if (strcmp(encoding, "latin-1") == 0)
1344 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346 else if (strcmp(encoding, "mbcs") == 0)
1347 return PyUnicode_AsMBCSString(unicode);
1348#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001349 else if (strcmp(encoding, "ascii") == 0)
1350 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001351 /* During bootstrap, we may need to find the encodings
1352 package, to load the file system encoding, and require the
1353 file system encoding in order to load the encodings
1354 package.
1355
1356 Break out of this dependency by assuming that the path to
1357 the encodings module is ASCII-only. XXX could try wcstombs
1358 instead, if the file system encoding is the locale's
1359 encoding. */
1360 else if (Py_FileSystemDefaultEncoding &&
1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362 !PyThreadState_GET()->interp->codecs_initialized)
1363 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
1366 /* Encode via the codec registry */
1367 v = PyCodec_Encode(unicode, encoding, errors);
1368 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001369 return NULL;
1370
1371 /* The normal path */
1372 if (PyBytes_Check(v))
1373 return v;
1374
1375 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376 if (PyByteArray_Check(v)) {
1377 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001378 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001379 PyOS_snprintf(msg, sizeof(msg),
1380 "encoder %s returned buffer instead of bytes",
1381 encoding);
1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001383 Py_DECREF(v);
1384 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001385 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388 Py_DECREF(v);
1389 return b;
1390 }
1391
1392 PyErr_Format(PyExc_TypeError,
1393 "encoder did not return a bytes object (type=%.400s)",
1394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
1411 encoding = PyUnicode_GetDefaultEncoding();
1412
1413 /* Encode via the codec registry */
1414 v = PyCodec_Encode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 onError:
1427 return NULL;
1428}
1429
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431 const char *errors)
1432{
1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001434 if (v)
1435 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001436 if (errors != NULL)
1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001439 PyUnicode_GET_SIZE(unicode),
1440 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001441 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001442 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001443 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001444 return v;
1445}
1446
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001448PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001450 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001452
Christian Heimes5894ba72007-11-04 11:43:14 +00001453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457 can be undefined. If it is case, decode using UTF-8. The following assumes
1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459 bootstrapping process where the codecs aren't ready yet.
1460 */
1461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001464 return PyUnicode_DecodeMBCS(s, size, "replace");
1465 }
1466#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001468 return PyUnicode_DecodeUTF8(s, size, "replace");
1469 }
1470#endif
1471 return PyUnicode_Decode(s, size,
1472 Py_FileSystemDefaultEncoding,
1473 "replace");
1474 }
1475 else {
1476 return PyUnicode_DecodeUTF8(s, size, "replace");
1477 }
1478}
1479
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001482{
Christian Heimesf3863112007-11-22 07:46:41 +00001483 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
1486 return NULL;
1487 }
Christian Heimesf3863112007-11-22 07:46:41 +00001488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001490 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001491 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001492 *psize = PyBytes_GET_SIZE(bytes);
1493 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001494}
1495
1496char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001498{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001499 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001500}
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 goto onError;
1507 }
1508 return PyUnicode_AS_UNICODE(unicode);
1509
1510 onError:
1511 return NULL;
1512}
1513
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515{
1516 if (!PyUnicode_Check(unicode)) {
1517 PyErr_BadArgument();
1518 goto onError;
1519 }
1520 return PyUnicode_GET_SIZE(unicode);
1521
1522 onError:
1523 return -1;
1524}
1525
Thomas Wouters78890102000-07-22 19:25:51 +00001526const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001527{
1528 return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001533 if (strcmp(encoding, unicode_default_encoding) != 0) {
1534 PyErr_Format(PyExc_ValueError,
1535 "Can only set default encoding to %s",
1536 unicode_default_encoding);
1537 return -1;
1538 }
Fred Drakee4315f52000-05-09 19:53:39 +00001539 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001540}
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542/* error handling callback helper:
1543 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001544 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 and adjust various state variables.
1546 return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001552 const char **input, const char **inend, Py_ssize_t *startinpos,
1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557
1558 PyObject *restuple = NULL;
1559 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t requiredsize;
1563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001565 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 int res = -1;
1568
1569 if (*errorHandler == NULL) {
1570 *errorHandler = PyCodec_LookupError(errors);
1571 if (*errorHandler == NULL)
1572 goto onError;
1573 }
1574
1575 if (*exceptionObject == NULL) {
1576 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 if (*exceptionObject == NULL)
1579 goto onError;
1580 }
1581 else {
1582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585 goto onError;
1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587 goto onError;
1588 }
1589
1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591 if (restuple == NULL)
1592 goto onError;
1593 if (!PyTuple_Check(restuple)) {
1594 PyErr_Format(PyExc_TypeError, &argparse[4]);
1595 goto onError;
1596 }
1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
1600 /* Copy back the bytes variables, which might have been modified by the
1601 callback */
1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603 if (!inputobj)
1604 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001605 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001608 *input = PyBytes_AS_STRING(inputobj);
1609 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001610 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001611 /* we can DECREF safely, as the exception has another reference,
1612 so the object won't go away. */
1613 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001616 newpos = insize+newpos;
1617 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001619 goto onError;
1620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 /* need more space? (at least enough for what we
1623 have+the replacement+the rest of the string (starting
1624 at the new input position), so we won't have to check space
1625 when there are no errors in the rest of the string) */
1626 repptr = PyUnicode_AS_UNICODE(repunicode);
1627 repsize = PyUnicode_GET_SIZE(repunicode);
1628 requiredsize = *outpos + repsize + insize-newpos;
1629 if (requiredsize > outsize) {
1630 if (requiredsize<2*outsize)
1631 requiredsize = 2*outsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632 if (_PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 goto onError;
1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635 }
1636 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001637 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_UNICODE_COPY(*outptr, repptr, repsize);
1639 *outptr += repsize;
1640 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 /* we made it! */
1643 res = 0;
1644
1645 onError:
1646 Py_XDECREF(restuple);
1647 return res;
1648}
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS): true when code point c must be encoded
   in a base64 shift sequence.  Anything outside 1..127 is special; inside
   that range utf7_special[] decides, with classes 2 and 3 counting as
   special only when encodeWS / encodeO are set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 character c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the top 6 pending bits as one base64 character through out and reduce
   bits by 6.  Wrapped in do/while(0) so it behaves as a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract one 16-bit unit and write it through out.  A unit in the
   range 0xDC00..0xDFFF is not written: it sets surrogate, sets errmsg and
   jumps to utf7Error.  While surrogate is set the next unit is dropped and
   the flag is cleared.  The expansion site must provide `errmsg` and a
   `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 const char *errors)
1719{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
1723PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1724 Py_ssize_t size,
1725 const char *errors,
1726 Py_ssize_t *consumed)
1727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001729 Py_ssize_t startinpos;
1730 Py_ssize_t endinpos;
1731 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 const char *e;
1733 PyUnicodeObject *unicode;
1734 Py_UNICODE *p;
1735 const char *errmsg = "";
1736 int inShift = 0;
1737 unsigned int bitsleft = 0;
1738 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 int surrogate = 0;
1740 PyObject *errorHandler = NULL;
1741 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
1743 unicode = _PyUnicode_New(size);
1744 if (!unicode)
1745 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001746 if (size == 0) {
1747 if (consumed)
1748 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751
1752 p = unicode->str;
1753 e = s + size;
1754
1755 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 Py_UNICODE ch;
1757 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001758 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759
1760 if (inShift) {
1761 if ((ch == '-') || !B64CHAR(ch)) {
1762 inShift = 0;
1763 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1766 if (bitsleft >= 6) {
1767 /* The shift sequence has a partial character in it. If
1768 bitsleft < 6 then we could just classify it as padding
1769 but that is not the case here */
1770
1771 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001772 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 }
1774 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001775 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 here so indicate the potential of a misencoded character. */
1777
1778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1780 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001781 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 }
1783
1784 if (ch == '-') {
1785 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001786 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 inShift = 1;
1788 }
1789 } else if (SPECIAL(ch,0,0)) {
1790 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001791 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 } else {
1793 *p++ = ch;
1794 }
1795 } else {
1796 charsleft = (charsleft << 6) | UB64(ch);
1797 bitsleft += 6;
1798 s++;
1799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1800 }
1801 }
1802 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 s++;
1805 if (s < e && *s == '-') {
1806 s++;
1807 *p++ = '+';
1808 } else
1809 {
1810 inShift = 1;
1811 bitsleft = 0;
1812 }
1813 }
1814 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001815 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 errmsg = "unexpected special character";
1817 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001818 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 }
1820 else {
1821 *p++ = ch;
1822 s++;
1823 }
1824 continue;
1825 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 outpos = p-PyUnicode_AS_UNICODE(unicode);
1827 endinpos = s-starts;
1828 if (unicode_decode_call_errorhandler(
1829 errors, &errorHandler,
1830 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001831 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001832 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 }
1835
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001836 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 outpos = p-PyUnicode_AS_UNICODE(unicode);
1838 endinpos = size;
1839 if (unicode_decode_call_errorhandler(
1840 errors, &errorHandler,
1841 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001843 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 if (s < e)
1846 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001848 if (consumed) {
1849 if(inShift)
1850 *consumed = startinpos;
1851 else
1852 *consumed = s-starts;
1853 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 goto onError;
1857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 return (PyObject *)unicode;
1861
1862onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865 Py_DECREF(unicode);
1866 return NULL;
1867}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 int encodeSetO,
1873 int encodeWhiteSpace,
1874 const char *errors)
1875{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001876 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 unsigned int bitsleft = 0;
1882 unsigned long charsleft = 0;
1883 char * out;
1884 char * start;
1885
1886 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001887 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001889 if (cbAllocated / 5 != size)
1890 return PyErr_NoMemory();
1891
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001892 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (v == NULL)
1894 return NULL;
1895
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001896 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 for (;i < size; ++i) {
1898 Py_UNICODE ch = s[i];
1899
1900 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001901 if (ch == '+') {
1902 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *out++ = '-';
1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905 charsleft = ch;
1906 bitsleft = 16;
1907 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 } else {
1911 *out++ = (char) ch;
1912 }
1913 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915 *out++ = B64(charsleft << (6-bitsleft));
1916 charsleft = 0;
1917 bitsleft = 0;
1918 /* Characters not in the BASE64 set implicitly unshift the sequence
1919 so no '-' is required, except if the character is itself a '-' */
1920 if (B64CHAR(ch) || ch == '-') {
1921 *out++ = '-';
1922 }
1923 inShift = 0;
1924 *out++ = (char) ch;
1925 } else {
1926 bitsleft += 16;
1927 charsleft = (charsleft << 16) | ch;
1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001931 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 or '-' then the shift sequence will be terminated implicitly and we
1933 don't have to insert a '-'. */
1934
1935 if (bitsleft == 0) {
1936 if (i + 1 < size) {
1937 Py_UNICODE ch2 = s[i+1];
1938
1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 } else if (B64CHAR(ch2) || ch2 == '-') {
1942 *out++ = '-';
1943 inShift = 0;
1944 } else {
1945 inShift = 0;
1946 }
1947
1948 }
1949 else {
1950 *out++ = '-';
1951 inShift = 0;
1952 }
1953 }
Tim Petersced69f82003-09-16 20:30:58 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (bitsleft) {
1958 *out++= B64(charsleft << (6-bitsleft) );
1959 *out++ = '-';
1960 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001961 if (_PyBytes_Resize(&v, out - start) < 0)
1962 return NULL;
1963 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001964}
1965
1966#undef SPECIAL
1967#undef B64
1968#undef B64CHAR
1969#undef UB64
1970#undef ENCODE
1971#undef DECODE
1972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973/* --- UTF-8 Codec -------------------------------------------------------- */
1974
static
char utf8_code_length[256] = {
    /* Indexed by the first byte of a UTF-8 sequence; the value is the
       total length of the sequence in bytes.  Zero marks a byte that is
       not a legal first byte.  See RFC 2279 for details. */

    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80-0xBF: not a legal first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1996
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001998 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 const char *errors)
2000{
Walter Dörwald69652032004-09-07 20:24:22 +00002001 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2002}
2003
2004PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002005 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002006 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002007 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002008{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002011 Py_ssize_t startinpos;
2012 Py_ssize_t endinpos;
2013 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014 const char *e;
2015 PyUnicodeObject *unicode;
2016 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002017 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 PyObject *errorHandler = NULL;
2019 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020
2021 /* Note: size will always be longer than the resulting Unicode
2022 character count */
2023 unicode = _PyUnicode_New(size);
2024 if (!unicode)
2025 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002026 if (size == 0) {
2027 if (consumed)
2028 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031
2032 /* Unpack UTF-8 encoded data */
2033 p = unicode->str;
2034 e = s + size;
2035
2036 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002037 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
2039 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002040 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 s++;
2042 continue;
2043 }
2044
2045 n = utf8_code_length[ch];
2046
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002047 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002048 if (consumed)
2049 break;
2050 else {
2051 errmsg = "unexpected end of data";
2052 startinpos = s-starts;
2053 endinpos = size;
2054 goto utf8Error;
2055 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 switch (n) {
2059
2060 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002061 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
2063 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002064 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065
2066 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002067 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 startinpos = s-starts;
2069 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002070 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
2072 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002073 if ((s[1] & 0xc0) != 0x80) {
2074 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 startinpos = s-starts;
2076 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002077 goto utf8Error;
2078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002080 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 startinpos = s-starts;
2082 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002083 errmsg = "illegal encoding";
2084 goto utf8Error;
2085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002087 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 break;
2089
2090 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002091 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002092 (s[2] & 0xc0) != 0x80) {
2093 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 startinpos = s-starts;
2095 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002096 goto utf8Error;
2097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002099 if (ch < 0x0800) {
2100 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002101 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002102
2103 XXX For wide builds (UCS-4) we should probably try
2104 to recombine the surrogates into a single code
2105 unit.
2106 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002107 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002108 startinpos = s-starts;
2109 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002110 goto utf8Error;
2111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002113 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002114 break;
2115
2116 case 4:
2117 if ((s[1] & 0xc0) != 0x80 ||
2118 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002119 (s[3] & 0xc0) != 0x80) {
2120 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 startinpos = s-starts;
2122 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002123 goto utf8Error;
2124 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002125 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2126 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2127 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002128 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002129 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002130 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002131 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002132 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002133 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 startinpos = s-starts;
2135 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002136 goto utf8Error;
2137 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002138#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002139 *p++ = (Py_UNICODE)ch;
2140#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002142
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002143 /* translate from 10000..10FFFF to 0..FFFF */
2144 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002145
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002146 /* high surrogate = top 10 bits added to D800 */
2147 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002148
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002149 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002150 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002151#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 break;
2153
2154 default:
2155 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002156 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002157 startinpos = s-starts;
2158 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002159 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 }
2161 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002162 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002163
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002164 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 outpos = p-PyUnicode_AS_UNICODE(unicode);
2166 if (unicode_decode_call_errorhandler(
2167 errors, &errorHandler,
2168 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002169 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002170 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 }
Walter Dörwald69652032004-09-07 20:24:22 +00002173 if (consumed)
2174 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175
2176 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002177 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 goto onError;
2179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002180 Py_XDECREF(errorHandler);
2181 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 return (PyObject *)unicode;
2183
2184onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002185 Py_XDECREF(errorHandler);
2186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 Py_DECREF(unicode);
2188 return NULL;
2189}
2190
Tim Peters602f7402002-04-27 18:03:26 +00002191/* Allocation strategy: if the string is short, convert into a stack buffer
2192 and allocate exactly as much space needed at the end. Else allocate the
2193 maximum possible needed (4 result bytes per Unicode character), and return
2194 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002195*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002196PyObject *
2197PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002198 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200{
Tim Peters602f7402002-04-27 18:03:26 +00002201#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002202
Guido van Rossum98297ee2007-11-06 21:34:58 +00002203 Py_ssize_t i; /* index into s of next input byte */
2204 PyObject *result; /* result string object */
2205 char *p; /* next free byte in output buffer */
2206 Py_ssize_t nallocated; /* number of result bytes allocated */
2207 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002208 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002209
Tim Peters602f7402002-04-27 18:03:26 +00002210 assert(s != NULL);
2211 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Tim Peters602f7402002-04-27 18:03:26 +00002213 if (size <= MAX_SHORT_UNICHARS) {
2214 /* Write into the stack buffer; nallocated can't overflow.
2215 * At the end, we'll allocate exactly as much heap space as it
2216 * turns out we need.
2217 */
2218 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002219 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002220 p = stackbuf;
2221 }
2222 else {
2223 /* Overallocate on the heap, and give the excess back at the end. */
2224 nallocated = size * 4;
2225 if (nallocated / 4 != size) /* overflow! */
2226 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002227 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002228 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002229 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002230 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002231 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002232
Tim Peters602f7402002-04-27 18:03:26 +00002233 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002234 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002235
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002236 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002237 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002239
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002241 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002242 *p++ = (char)(0xc0 | (ch >> 6));
2243 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002244 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002245 else {
Tim Peters602f7402002-04-27 18:03:26 +00002246 /* Encode UCS2 Unicode ordinals */
2247 if (ch < 0x10000) {
2248 /* Special case: check for high surrogate */
2249 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2250 Py_UCS4 ch2 = s[i];
2251 /* Check for low surrogate and combine the two to
2252 form a UCS4 value */
2253 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002254 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002255 i++;
2256 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002257 }
Tim Peters602f7402002-04-27 18:03:26 +00002258 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002260 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002261 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2262 *p++ = (char)(0x80 | (ch & 0x3f));
2263 continue;
2264 }
2265encodeUCS4:
2266 /* Encode UCS4 Unicode ordinals */
2267 *p++ = (char)(0xf0 | (ch >> 18));
2268 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2269 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2270 *p++ = (char)(0x80 | (ch & 0x3f));
2271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002273
Guido van Rossum98297ee2007-11-06 21:34:58 +00002274 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002275 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002276 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002277 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002278 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002279 }
2280 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002281 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002282 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002283 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002284 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002285 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002286 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002287
Tim Peters602f7402002-04-27 18:03:26 +00002288#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289}
2290
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2292{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 if (!PyUnicode_Check(unicode)) {
2294 PyErr_BadArgument();
2295 return NULL;
2296 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002297 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2298 PyUnicode_GET_SIZE(unicode),
2299 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300}
2301
Walter Dörwald41980ca2007-08-16 21:55:45 +00002302/* --- UTF-32 Codec ------------------------------------------------------- */
2303
2304PyObject *
2305PyUnicode_DecodeUTF32(const char *s,
2306 Py_ssize_t size,
2307 const char *errors,
2308 int *byteorder)
2309{
2310 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2311}
2312
2313PyObject *
2314PyUnicode_DecodeUTF32Stateful(const char *s,
2315 Py_ssize_t size,
2316 const char *errors,
2317 int *byteorder,
2318 Py_ssize_t *consumed)
2319{
2320 const char *starts = s;
2321 Py_ssize_t startinpos;
2322 Py_ssize_t endinpos;
2323 Py_ssize_t outpos;
2324 PyUnicodeObject *unicode;
2325 Py_UNICODE *p;
2326#ifndef Py_UNICODE_WIDE
2327 int i, pairs;
2328#else
2329 const int pairs = 0;
2330#endif
2331 const unsigned char *q, *e;
2332 int bo = 0; /* assume native ordering by default */
2333 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002334 /* Offsets from q for retrieving bytes in the right order. */
2335#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2336 int iorder[] = {0, 1, 2, 3};
2337#else
2338 int iorder[] = {3, 2, 1, 0};
2339#endif
2340 PyObject *errorHandler = NULL;
2341 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002342 /* On narrow builds we split characters outside the BMP into two
2343 codepoints => count how much extra space we need. */
2344#ifndef Py_UNICODE_WIDE
2345 for (i = pairs = 0; i < size/4; i++)
2346 if (((Py_UCS4 *)s)[i] >= 0x10000)
2347 pairs++;
2348#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002349
2350 /* This might be one to much, because of a BOM */
2351 unicode = _PyUnicode_New((size+3)/4+pairs);
2352 if (!unicode)
2353 return NULL;
2354 if (size == 0)
2355 return (PyObject *)unicode;
2356
2357 /* Unpack UTF-32 encoded data */
2358 p = unicode->str;
2359 q = (unsigned char *)s;
2360 e = q + size;
2361
2362 if (byteorder)
2363 bo = *byteorder;
2364
2365 /* Check for BOM marks (U+FEFF) in the input and adjust current
2366 byte order setting accordingly. In native mode, the leading BOM
2367 mark is skipped, in all other modes, it is copied to the output
2368 stream as-is (giving a ZWNBSP character). */
2369 if (bo == 0) {
2370 if (size >= 4) {
2371 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2372 (q[iorder[1]] << 8) | q[iorder[0]];
2373#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2374 if (bom == 0x0000FEFF) {
2375 q += 4;
2376 bo = -1;
2377 }
2378 else if (bom == 0xFFFE0000) {
2379 q += 4;
2380 bo = 1;
2381 }
2382#else
2383 if (bom == 0x0000FEFF) {
2384 q += 4;
2385 bo = 1;
2386 }
2387 else if (bom == 0xFFFE0000) {
2388 q += 4;
2389 bo = -1;
2390 }
2391#endif
2392 }
2393 }
2394
2395 if (bo == -1) {
2396 /* force LE */
2397 iorder[0] = 0;
2398 iorder[1] = 1;
2399 iorder[2] = 2;
2400 iorder[3] = 3;
2401 }
2402 else if (bo == 1) {
2403 /* force BE */
2404 iorder[0] = 3;
2405 iorder[1] = 2;
2406 iorder[2] = 1;
2407 iorder[3] = 0;
2408 }
2409
2410 while (q < e) {
2411 Py_UCS4 ch;
2412 /* remaining bytes at the end? (size should be divisible by 4) */
2413 if (e-q<4) {
2414 if (consumed)
2415 break;
2416 errmsg = "truncated data";
2417 startinpos = ((const char *)q)-starts;
2418 endinpos = ((const char *)e)-starts;
2419 goto utf32Error;
2420 /* The remaining input chars are ignored if the callback
2421 chooses to skip the input */
2422 }
2423 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2424 (q[iorder[1]] << 8) | q[iorder[0]];
2425
2426 if (ch >= 0x110000)
2427 {
2428 errmsg = "codepoint not in range(0x110000)";
2429 startinpos = ((const char *)q)-starts;
2430 endinpos = startinpos+4;
2431 goto utf32Error;
2432 }
2433#ifndef Py_UNICODE_WIDE
2434 if (ch >= 0x10000)
2435 {
2436 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2437 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2438 }
2439 else
2440#endif
2441 *p++ = ch;
2442 q += 4;
2443 continue;
2444 utf32Error:
2445 outpos = p-PyUnicode_AS_UNICODE(unicode);
2446 if (unicode_decode_call_errorhandler(
2447 errors, &errorHandler,
2448 "utf32", errmsg,
2449 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002450 &unicode, &outpos, &p))
Walter Dörwald41980ca2007-08-16 21:55:45 +00002451 goto onError;
2452 }
2453
2454 if (byteorder)
2455 *byteorder = bo;
2456
2457 if (consumed)
2458 *consumed = (const char *)q-starts;
2459
2460 /* Adjust length */
2461 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2462 goto onError;
2463
2464 Py_XDECREF(errorHandler);
2465 Py_XDECREF(exc);
2466 return (PyObject *)unicode;
2467
2468onError:
2469 Py_DECREF(unicode);
2470 Py_XDECREF(errorHandler);
2471 Py_XDECREF(exc);
2472 return NULL;
2473}
2474
2475PyObject *
2476PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2477 Py_ssize_t size,
2478 const char *errors,
2479 int byteorder)
2480{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002481 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002482 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002483 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002484#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002485 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002486#else
2487 const int pairs = 0;
2488#endif
2489 /* Offsets from p for storing byte pairs in the right order. */
2490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2491 int iorder[] = {0, 1, 2, 3};
2492#else
2493 int iorder[] = {3, 2, 1, 0};
2494#endif
2495
2496#define STORECHAR(CH) \
2497 do { \
2498 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2499 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2500 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2501 p[iorder[0]] = (CH) & 0xff; \
2502 p += 4; \
2503 } while(0)
2504
2505 /* In narrow builds we can output surrogate pairs as one codepoint,
2506 so we need less space. */
2507#ifndef Py_UNICODE_WIDE
2508 for (i = pairs = 0; i < size-1; i++)
2509 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2510 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2511 pairs++;
2512#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002513 nsize = (size - pairs + (byteorder == 0));
2514 bytesize = nsize * 4;
2515 if (bytesize / 4 != nsize)
2516 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002517 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002518 if (v == NULL)
2519 return NULL;
2520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002521 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002522 if (byteorder == 0)
2523 STORECHAR(0xFEFF);
2524 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002525 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002526
2527 if (byteorder == -1) {
2528 /* force LE */
2529 iorder[0] = 0;
2530 iorder[1] = 1;
2531 iorder[2] = 2;
2532 iorder[3] = 3;
2533 }
2534 else if (byteorder == 1) {
2535 /* force BE */
2536 iorder[0] = 3;
2537 iorder[1] = 2;
2538 iorder[2] = 1;
2539 iorder[3] = 0;
2540 }
2541
2542 while (size-- > 0) {
2543 Py_UCS4 ch = *s++;
2544#ifndef Py_UNICODE_WIDE
2545 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2546 Py_UCS4 ch2 = *s;
2547 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2548 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2549 s++;
2550 size--;
2551 }
2552 }
2553#endif
2554 STORECHAR(ch);
2555 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002556
2557 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002558 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002559#undef STORECHAR
2560}
2561
2562PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2563{
2564 if (!PyUnicode_Check(unicode)) {
2565 PyErr_BadArgument();
2566 return NULL;
2567 }
2568 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2569 PyUnicode_GET_SIZE(unicode),
2570 NULL,
2571 0);
2572}
2573
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574/* --- UTF-16 Codec ------------------------------------------------------- */
2575
Tim Peters772747b2001-08-09 22:21:55 +00002576PyObject *
2577PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002579 const char *errors,
2580 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581{
Walter Dörwald69652032004-09-07 20:24:22 +00002582 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2583}
2584
2585PyObject *
2586PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002587 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002588 const char *errors,
2589 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002590 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002593 Py_ssize_t startinpos;
2594 Py_ssize_t endinpos;
2595 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 PyUnicodeObject *unicode;
2597 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002598 const unsigned char *q, *e;
2599 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002600 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002601 /* Offsets from q for retrieving byte pairs in the right order. */
2602#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2603 int ihi = 1, ilo = 0;
2604#else
2605 int ihi = 0, ilo = 1;
2606#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 PyObject *errorHandler = NULL;
2608 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Note: size will always be longer than the resulting Unicode
2611 character count */
2612 unicode = _PyUnicode_New(size);
2613 if (!unicode)
2614 return NULL;
2615 if (size == 0)
2616 return (PyObject *)unicode;
2617
2618 /* Unpack UTF-16 encoded data */
2619 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002620 q = (unsigned char *)s;
2621 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622
2623 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002624 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002626 /* Check for BOM marks (U+FEFF) in the input and adjust current
2627 byte order setting accordingly. In native mode, the leading BOM
2628 mark is skipped, in all other modes, it is copied to the output
2629 stream as-is (giving a ZWNBSP character). */
2630 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002631 if (size >= 2) {
2632 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002633#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002634 if (bom == 0xFEFF) {
2635 q += 2;
2636 bo = -1;
2637 }
2638 else if (bom == 0xFFFE) {
2639 q += 2;
2640 bo = 1;
2641 }
Tim Petersced69f82003-09-16 20:30:58 +00002642#else
Walter Dörwald69652032004-09-07 20:24:22 +00002643 if (bom == 0xFEFF) {
2644 q += 2;
2645 bo = 1;
2646 }
2647 else if (bom == 0xFFFE) {
2648 q += 2;
2649 bo = -1;
2650 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002651#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002652 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654
Tim Peters772747b2001-08-09 22:21:55 +00002655 if (bo == -1) {
2656 /* force LE */
2657 ihi = 1;
2658 ilo = 0;
2659 }
2660 else if (bo == 1) {
2661 /* force BE */
2662 ihi = 0;
2663 ilo = 1;
2664 }
2665
2666 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002668 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002670 if (consumed)
2671 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 errmsg = "truncated data";
2673 startinpos = ((const char *)q)-starts;
2674 endinpos = ((const char *)e)-starts;
2675 goto utf16Error;
2676 /* The remaining input chars are ignored if the callback
2677 chooses to skip the input */
2678 }
2679 ch = (q[ihi] << 8) | q[ilo];
2680
Tim Peters772747b2001-08-09 22:21:55 +00002681 q += 2;
2682
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 if (ch < 0xD800 || ch > 0xDFFF) {
2684 *p++ = ch;
2685 continue;
2686 }
2687
2688 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002689 if (q >= e) {
2690 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 startinpos = (((const char *)q)-2)-starts;
2692 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002693 goto utf16Error;
2694 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002695 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002696 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2697 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002698 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002699#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002700 *p++ = ch;
2701 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002702#else
2703 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002704#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002705 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002706 }
2707 else {
2708 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 startinpos = (((const char *)q)-4)-starts;
2710 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002711 goto utf16Error;
2712 }
2713
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002715 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 startinpos = (((const char *)q)-2)-starts;
2717 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002718 /* Fall through to report the error */
2719
2720 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(unicode);
2722 if (unicode_decode_call_errorhandler(
2723 errors, &errorHandler,
2724 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002725 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002726 &unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 }
2729
2730 if (byteorder)
2731 *byteorder = bo;
2732
Walter Dörwald69652032004-09-07 20:24:22 +00002733 if (consumed)
2734 *consumed = (const char *)q-starts;
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002737 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 goto onError;
2739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 Py_XDECREF(errorHandler);
2741 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 return (PyObject *)unicode;
2743
2744onError:
2745 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 Py_XDECREF(errorHandler);
2747 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 return NULL;
2749}
2750
Tim Peters772747b2001-08-09 22:21:55 +00002751PyObject *
2752PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002753 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002754 const char *errors,
2755 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002757 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002758 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002759 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002760#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002761 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002762#else
2763 const int pairs = 0;
2764#endif
Tim Peters772747b2001-08-09 22:21:55 +00002765 /* Offsets from p for storing byte pairs in the right order. */
2766#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2767 int ihi = 1, ilo = 0;
2768#else
2769 int ihi = 0, ilo = 1;
2770#endif
2771
2772#define STORECHAR(CH) \
2773 do { \
2774 p[ihi] = ((CH) >> 8) & 0xff; \
2775 p[ilo] = (CH) & 0xff; \
2776 p += 2; \
2777 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002779#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002780 for (i = pairs = 0; i < size; i++)
2781 if (s[i] >= 0x10000)
2782 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002783#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002784 /* 2 * (size + pairs + (byteorder == 0)) */
2785 if (size > PY_SSIZE_T_MAX ||
2786 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2787 return PyErr_NoMemory();
2788 nsize = size + pairs + (byteorder == 0);
2789 bytesize = nsize * 2;
2790 if (bytesize / 2 != nsize)
2791 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002792 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 if (v == NULL)
2794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002796 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002798 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002799 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002800 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002801
2802 if (byteorder == -1) {
2803 /* force LE */
2804 ihi = 1;
2805 ilo = 0;
2806 }
2807 else if (byteorder == 1) {
2808 /* force BE */
2809 ihi = 0;
2810 ilo = 1;
2811 }
2812
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002813 while (size-- > 0) {
2814 Py_UNICODE ch = *s++;
2815 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002816#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002817 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002818 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2819 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002821#endif
Tim Peters772747b2001-08-09 22:21:55 +00002822 STORECHAR(ch);
2823 if (ch2)
2824 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826
2827 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002828 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002829#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830}
2831
2832PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2833{
2834 if (!PyUnicode_Check(unicode)) {
2835 PyErr_BadArgument();
2836 return NULL;
2837 }
2838 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2839 PyUnicode_GET_SIZE(unicode),
2840 NULL,
2841 0);
2842}
2843
2844/* --- Unicode Escape Codec ----------------------------------------------- */
2845
/* Cached pointer to the "ucnhash_CAPI" object exported by the unicodedata
   module.  It starts out NULL; PyUnicode_DecodeUnicodeEscape() imports
   unicodedata and fills it in the first time it handles a \N escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002846static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002847
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 const char *errors)
2851{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t startinpos;
2854 Py_ssize_t endinpos;
2855 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002860 char* message;
2861 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 PyObject *errorHandler = NULL;
2863 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 /* Escaped strings will always be longer than the resulting
2866 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 length after conversion to the true value.
2868 (but if the error callback returns a long replacement string
2869 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 v = _PyUnicode_New(size);
2871 if (v == NULL)
2872 goto onError;
2873 if (size == 0)
2874 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 while (s < end) {
2880 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002881 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883
2884 /* Non-escape characters are interpreted as Unicode ordinals */
2885 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 continue;
2888 }
2889
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 /* \ - Escapes */
2892 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002893 c = *s++;
2894 if (s > end)
2895 c = '\0'; /* Invalid after \ */
2896 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897
2898 /* \x escapes */
2899 case '\n': break;
2900 case '\\': *p++ = '\\'; break;
2901 case '\'': *p++ = '\''; break;
2902 case '\"': *p++ = '\"'; break;
2903 case 'b': *p++ = '\b'; break;
2904 case 'f': *p++ = '\014'; break; /* FF */
2905 case 't': *p++ = '\t'; break;
2906 case 'n': *p++ = '\n'; break;
2907 case 'r': *p++ = '\r'; break;
2908 case 'v': *p++ = '\013'; break; /* VT */
2909 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2910
2911 /* \OOO (octal) escapes */
2912 case '0': case '1': case '2': case '3':
2913 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002914 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002915 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002916 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002917 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002918 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002920 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 break;
2922
Fredrik Lundhccc74732001-02-18 22:13:49 +00002923 /* hex escapes */
2924 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 digits = 2;
2927 message = "truncated \\xXX escape";
2928 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929
Fredrik Lundhccc74732001-02-18 22:13:49 +00002930 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002932 digits = 4;
2933 message = "truncated \\uXXXX escape";
2934 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Fredrik Lundhccc74732001-02-18 22:13:49 +00002936 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002937 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002938 digits = 8;
2939 message = "truncated \\UXXXXXXXX escape";
2940 hexescape:
2941 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 outpos = p-PyUnicode_AS_UNICODE(v);
2943 if (s+digits>end) {
2944 endinpos = size;
2945 if (unicode_decode_call_errorhandler(
2946 errors, &errorHandler,
2947 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002948 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002949 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 goto onError;
2951 goto nextByte;
2952 }
2953 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002954 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002955 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 endinpos = (s+i+1)-starts;
2957 if (unicode_decode_call_errorhandler(
2958 errors, &errorHandler,
2959 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002960 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002961 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002962 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002964 }
2965 chr = (chr<<4) & ~0xF;
2966 if (c >= '0' && c <= '9')
2967 chr += c - '0';
2968 else if (c >= 'a' && c <= 'f')
2969 chr += 10 + c - 'a';
2970 else
2971 chr += 10 + c - 'A';
2972 }
2973 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002974 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 /* _decoding_error will have already written into the
2976 target buffer. */
2977 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002978 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002979 /* when we get here, chr is a 32-bit unicode character */
2980 if (chr <= 0xffff)
2981 /* UCS-2 character */
2982 *p++ = (Py_UNICODE) chr;
2983 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002984 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002985 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002986#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002987 *p++ = chr;
2988#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002989 chr -= 0x10000L;
2990 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002991 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002992#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002993 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 endinpos = s-starts;
2995 outpos = p-PyUnicode_AS_UNICODE(v);
2996 if (unicode_decode_call_errorhandler(
2997 errors, &errorHandler,
2998 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002999 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003000 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003001 goto onError;
3002 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003003 break;
3004
3005 /* \N{name} */
3006 case 'N':
3007 message = "malformed \\N character escape";
3008 if (ucnhash_CAPI == NULL) {
3009 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003010 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003011 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003012 if (m == NULL)
3013 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003014 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003015 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003016 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003017 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003018 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003019 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003020 if (ucnhash_CAPI == NULL)
3021 goto ucnhashError;
3022 }
3023 if (*s == '{') {
3024 const char *start = s+1;
3025 /* look for the closing brace */
3026 while (*s != '}' && s < end)
3027 s++;
3028 if (s > start && s < end && *s == '}') {
3029 /* found a name. look it up in the unicode database */
3030 message = "unknown Unicode character name";
3031 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003032 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003033 goto store;
3034 }
3035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 endinpos = s-starts;
3037 outpos = p-PyUnicode_AS_UNICODE(v);
3038 if (unicode_decode_call_errorhandler(
3039 errors, &errorHandler,
3040 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003041 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003042 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003043 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003044 break;
3045
3046 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003047 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 message = "\\ at end of string";
3049 s--;
3050 endinpos = s-starts;
3051 outpos = p-PyUnicode_AS_UNICODE(v);
3052 if (unicode_decode_call_errorhandler(
3053 errors, &errorHandler,
3054 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003055 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003056 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003057 goto onError;
3058 }
3059 else {
3060 *p++ = '\\';
3061 *p++ = (unsigned char)s[-1];
3062 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003063 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 nextByte:
3066 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003068 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003073
Fredrik Lundhccc74732001-02-18 22:13:49 +00003074ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003075 PyErr_SetString(
3076 PyExc_UnicodeError,
3077 "\\N escapes not supported (can't load unicodedata module)"
3078 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003079 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 Py_XDECREF(errorHandler);
3081 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003082 return NULL;
3083
Fredrik Lundhccc74732001-02-18 22:13:49 +00003084onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 Py_XDECREF(errorHandler);
3087 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 return NULL;
3089}
3090
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder defined below takes no "quotes" argument; it emits
   only the escaped text and never adds enclosing u"" or u'' quotes.

*/
3097
Thomas Wouters477c8d52006-05-27 19:21:47 +00003098Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3099 Py_ssize_t size,
3100 Py_UNICODE ch)
3101{
3102 /* like wcschr, but doesn't stop at NULL characters */
3103
3104 while (size-- > 0) {
3105 if (*s == ch)
3106 return s;
3107 s++;
3108 }
3109
3110 return NULL;
3111}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003112
/* Lower-case hexadecimal digit table: hexdigits[n] is the character that
   represents the nibble value n (0..15).  Used by the escape encoders. */
static const char *hexdigits = "0123456789" "abcdef";
3114
3115PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3116 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003118 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003121#ifdef Py_UNICODE_WIDE
3122 const Py_ssize_t expandsize = 10;
3123#else
3124 const Py_ssize_t expandsize = 6;
3125#endif
3126
Thomas Wouters89f507f2006-12-13 04:49:30 +00003127 /* XXX(nnorwitz): rather than over-allocating, it would be
3128 better to choose a different scheme. Perhaps scan the
3129 first N-chars of the string and allocate based on that size.
3130 */
3131 /* Initial allocation is based on the longest-possible unichr
3132 escape.
3133
3134 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3135 unichr, so in this case it's the longest unichr escape. In
3136 narrow (UTF-16) builds this is five chars per source unichr
3137 since there are two unichrs in the surrogate pair, so in narrow
3138 (UTF-16) builds it's not the longest unichr escape.
3139
3140 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3141 so in the narrow (UTF-16) build case it's the longest unichr
3142 escape.
3143 */
3144
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003145 if (size == 0)
3146 return PyBytes_FromStringAndSize(NULL, 0);
3147
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003148 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3149 return PyErr_NoMemory();
3150
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003151 repr = PyBytes_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003152 2
3153 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003154 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (repr == NULL)
3156 return NULL;
3157
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003158 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 while (size-- > 0) {
3161 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003162
Walter Dörwald79e913e2007-05-12 11:08:06 +00003163 /* Escape backslashes */
3164 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 *p++ = '\\';
3166 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003167 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003168 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003169
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003170#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003171 /* Map 21-bit characters to '\U00xxxxxx' */
3172 else if (ch >= 0x10000) {
3173 *p++ = '\\';
3174 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003175 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3176 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3177 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3178 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3179 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3180 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3181 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3182 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003183 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003184 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003185#else
3186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003187 else if (ch >= 0xD800 && ch < 0xDC00) {
3188 Py_UNICODE ch2;
3189 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003190
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003191 ch2 = *s++;
3192 size--;
3193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3195 *p++ = '\\';
3196 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003197 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3198 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3199 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3200 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3201 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3202 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3203 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3204 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003205 continue;
3206 }
3207 /* Fall through: isolated surrogates are copied as-is */
3208 s--;
3209 size++;
3210 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003211#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003214 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 *p++ = '\\';
3216 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003217 *p++ = hexdigits[(ch >> 12) & 0x000F];
3218 *p++ = hexdigits[(ch >> 8) & 0x000F];
3219 *p++ = hexdigits[(ch >> 4) & 0x000F];
3220 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003222
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003223 /* Map special whitespace to '\t', \n', '\r' */
3224 else if (ch == '\t') {
3225 *p++ = '\\';
3226 *p++ = 't';
3227 }
3228 else if (ch == '\n') {
3229 *p++ = '\\';
3230 *p++ = 'n';
3231 }
3232 else if (ch == '\r') {
3233 *p++ = '\\';
3234 *p++ = 'r';
3235 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003236
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003237 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003238 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003240 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003241 *p++ = hexdigits[(ch >> 4) & 0x000F];
3242 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 /* Copy everything else as-is */
3246 else
3247 *p++ = (char) ch;
3248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003250 assert(p - PyBytes_AS_STRING(repr) > 0);
3251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3252 return NULL;
3253 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254}
3255
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003256PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003258 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 if (!PyUnicode_Check(unicode)) {
3260 PyErr_BadArgument();
3261 return NULL;
3262 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003263 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3264 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003265 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266}
3267
3268/* --- Raw Unicode Escape Codec ------------------------------------------- */
3269
3270PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003271 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 const char *errors)
3273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003275 Py_ssize_t startinpos;
3276 Py_ssize_t endinpos;
3277 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 const char *end;
3281 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 PyObject *errorHandler = NULL;
3283 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003284
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 /* Escaped strings will always be longer than the resulting
3286 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 length after conversion to the true value. (But decoding error
3288 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 v = _PyUnicode_New(size);
3290 if (v == NULL)
3291 goto onError;
3292 if (size == 0)
3293 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 end = s + size;
3296 while (s < end) {
3297 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003298 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301
3302 /* Non-escape characters are interpreted as Unicode ordinals */
3303 if (*s != '\\') {
3304 *p++ = (unsigned char)*s++;
3305 continue;
3306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308
3309 /* \u-escapes are only interpreted iff the number of leading
3310 backslashes if odd */
3311 bs = s;
3312 for (;s < end;) {
3313 if (*s != '\\')
3314 break;
3315 *p++ = (unsigned char)*s++;
3316 }
3317 if (((s - bs) & 1) == 0 ||
3318 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003319 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 continue;
3321 }
3322 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003323 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 s++;
3325
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003326 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003328 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003330 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 endinpos = s-starts;
3332 if (unicode_decode_call_errorhandler(
3333 errors, &errorHandler,
3334 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003335 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003336 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 }
3340 x = (x<<4) & ~0xF;
3341 if (c >= '0' && c <= '9')
3342 x += c - '0';
3343 else if (c >= 'a' && c <= 'f')
3344 x += 10 + c - 'a';
3345 else
3346 x += 10 + c - 'A';
3347 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003348 if (x <= 0xffff)
3349 /* UCS-2 character */
3350 *p++ = (Py_UNICODE) x;
3351 else if (x <= 0x10ffff) {
3352 /* UCS-4 character. Either store directly, or as
3353 surrogate pair. */
3354#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003355 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003356#else
3357 x -= 0x10000L;
3358 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3359 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3360#endif
3361 } else {
3362 endinpos = s-starts;
3363 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003364 if (unicode_decode_call_errorhandler(
3365 errors, &errorHandler,
3366 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003367 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003368 &v, &outpos, &p))
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003369 goto onError;
3370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 nextByte:
3372 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003374 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 Py_XDECREF(errorHandler);
3377 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003379
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 onError:
3381 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 Py_XDECREF(errorHandler);
3383 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 return NULL;
3385}
3386
3387PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003390 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 char *p;
3392 char *q;
3393
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003394#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003395 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003396#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003397 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003398#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003399
3400 if (size > PY_SSIZE_T_MAX / expandsize)
3401 return PyErr_NoMemory();
3402
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003403 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (repr == NULL)
3405 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003406 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003407 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003409 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 while (size-- > 0) {
3411 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003412#ifdef Py_UNICODE_WIDE
3413 /* Map 32-bit characters to '\Uxxxxxxxx' */
3414 if (ch >= 0x10000) {
3415 *p++ = '\\';
3416 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003417 *p++ = hexdigits[(ch >> 28) & 0xf];
3418 *p++ = hexdigits[(ch >> 24) & 0xf];
3419 *p++ = hexdigits[(ch >> 20) & 0xf];
3420 *p++ = hexdigits[(ch >> 16) & 0xf];
3421 *p++ = hexdigits[(ch >> 12) & 0xf];
3422 *p++ = hexdigits[(ch >> 8) & 0xf];
3423 *p++ = hexdigits[(ch >> 4) & 0xf];
3424 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003425 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003426 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003427#else
3428 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3429 if (ch >= 0xD800 && ch < 0xDC00) {
3430 Py_UNICODE ch2;
3431 Py_UCS4 ucs;
3432
3433 ch2 = *s++;
3434 size--;
3435 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3436 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3437 *p++ = '\\';
3438 *p++ = 'U';
3439 *p++ = hexdigits[(ucs >> 28) & 0xf];
3440 *p++ = hexdigits[(ucs >> 24) & 0xf];
3441 *p++ = hexdigits[(ucs >> 20) & 0xf];
3442 *p++ = hexdigits[(ucs >> 16) & 0xf];
3443 *p++ = hexdigits[(ucs >> 12) & 0xf];
3444 *p++ = hexdigits[(ucs >> 8) & 0xf];
3445 *p++ = hexdigits[(ucs >> 4) & 0xf];
3446 *p++ = hexdigits[ucs & 0xf];
3447 continue;
3448 }
3449 /* Fall through: isolated surrogates are copied as-is */
3450 s--;
3451 size++;
3452 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003453#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 /* Map 16-bit characters to '\uxxxx' */
3455 if (ch >= 256) {
3456 *p++ = '\\';
3457 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003458 *p++ = hexdigits[(ch >> 12) & 0xf];
3459 *p++ = hexdigits[(ch >> 8) & 0xf];
3460 *p++ = hexdigits[(ch >> 4) & 0xf];
3461 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 }
3463 /* Copy everything else as-is */
3464 else
3465 *p++ = (char) ch;
3466 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003467 size = p - q;
3468
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003469 assert(size > 0);
3470 if (_PyBytes_Resize(&repr, size) < 0)
3471 return NULL;
3472 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473}
3474
3475PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3476{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003477 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003479 PyErr_BadArgument();
3480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003482 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3483 PyUnicode_GET_SIZE(unicode));
3484
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003485 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486}
3487
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003488/* --- Unicode Internal Codec ------------------------------------------- */
3489
3490PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003491 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003492 const char *errors)
3493{
3494 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003495 Py_ssize_t startinpos;
3496 Py_ssize_t endinpos;
3497 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003498 PyUnicodeObject *v;
3499 Py_UNICODE *p;
3500 const char *end;
3501 const char *reason;
3502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504
Neal Norwitzd43069c2006-01-08 01:12:10 +00003505#ifdef Py_UNICODE_WIDE
3506 Py_UNICODE unimax = PyUnicode_GetMax();
3507#endif
3508
Thomas Wouters89f507f2006-12-13 04:49:30 +00003509 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003510 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3511 if (v == NULL)
3512 goto onError;
3513 if (PyUnicode_GetSize((PyObject *)v) == 0)
3514 return (PyObject *)v;
3515 p = PyUnicode_AS_UNICODE(v);
3516 end = s + size;
3517
3518 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003519 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003520 /* We have to sanity check the raw data, otherwise doom looms for
3521 some malformed UCS-4 data. */
3522 if (
3523 #ifdef Py_UNICODE_WIDE
3524 *p > unimax || *p < 0 ||
3525 #endif
3526 end-s < Py_UNICODE_SIZE
3527 )
3528 {
3529 startinpos = s - starts;
3530 if (end-s < Py_UNICODE_SIZE) {
3531 endinpos = end-starts;
3532 reason = "truncated input";
3533 }
3534 else {
3535 endinpos = s - starts + Py_UNICODE_SIZE;
3536 reason = "illegal code point (> 0x10FFFF)";
3537 }
3538 outpos = p - PyUnicode_AS_UNICODE(v);
3539 if (unicode_decode_call_errorhandler(
3540 errors, &errorHandler,
3541 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003542 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003543 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003544 goto onError;
3545 }
3546 }
3547 else {
3548 p++;
3549 s += Py_UNICODE_SIZE;
3550 }
3551 }
3552
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003553 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003554 goto onError;
3555 Py_XDECREF(errorHandler);
3556 Py_XDECREF(exc);
3557 return (PyObject *)v;
3558
3559 onError:
3560 Py_XDECREF(v);
3561 Py_XDECREF(errorHandler);
3562 Py_XDECREF(exc);
3563 return NULL;
3564}
3565
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566/* --- Latin-1 Codec ------------------------------------------------------ */
3567
3568PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 const char *errors)
3571{
3572 PyUnicodeObject *v;
3573 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003574
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003576 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003577 Py_UNICODE r = *(unsigned char*)s;
3578 return PyUnicode_FromUnicode(&r, 1);
3579 }
3580
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 v = _PyUnicode_New(size);
3582 if (v == NULL)
3583 goto onError;
3584 if (size == 0)
3585 return (PyObject *)v;
3586 p = PyUnicode_AS_UNICODE(v);
3587 while (size-- > 0)
3588 *p++ = (unsigned char)*s++;
3589 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003590
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 onError:
3592 Py_XDECREF(v);
3593 return NULL;
3594}
3595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596/* create or adjust a UnicodeEncodeError */
3597static void make_encode_exception(PyObject **exceptionObject,
3598 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003599 const Py_UNICODE *unicode, Py_ssize_t size,
3600 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 if (*exceptionObject == NULL) {
3604 *exceptionObject = PyUnicodeEncodeError_Create(
3605 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 }
3607 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3609 goto onError;
3610 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3611 goto onError;
3612 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3613 goto onError;
3614 return;
3615 onError:
3616 Py_DECREF(*exceptionObject);
3617 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 }
3619}
3620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621/* raises a UnicodeEncodeError */
3622static void raise_encode_exception(PyObject **exceptionObject,
3623 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003624 const Py_UNICODE *unicode, Py_ssize_t size,
3625 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 const char *reason)
3627{
3628 make_encode_exception(exceptionObject,
3629 encoding, unicode, size, startpos, endpos, reason);
3630 if (*exceptionObject != NULL)
3631 PyCodec_StrictErrors(*exceptionObject);
3632}
3633
3634/* error handling callback helper:
3635 build arguments, call the callback and check the arguments,
3636 put the result into newpos and return the replacement string, which
3637 has to be freed by the caller */
3638static PyObject *unicode_encode_call_errorhandler(const char *errors,
3639 PyObject **errorHandler,
3640 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003641 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3642 Py_ssize_t startpos, Py_ssize_t endpos,
3643 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003645 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646
3647 PyObject *restuple;
3648 PyObject *resunicode;
3649
3650 if (*errorHandler == NULL) {
3651 *errorHandler = PyCodec_LookupError(errors);
3652 if (*errorHandler == NULL)
3653 return NULL;
3654 }
3655
3656 make_encode_exception(exceptionObject,
3657 encoding, unicode, size, startpos, endpos, reason);
3658 if (*exceptionObject == NULL)
3659 return NULL;
3660
3661 restuple = PyObject_CallFunctionObjArgs(
3662 *errorHandler, *exceptionObject, NULL);
3663 if (restuple == NULL)
3664 return NULL;
3665 if (!PyTuple_Check(restuple)) {
3666 PyErr_Format(PyExc_TypeError, &argparse[4]);
3667 Py_DECREF(restuple);
3668 return NULL;
3669 }
3670 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3671 &resunicode, newpos)) {
3672 Py_DECREF(restuple);
3673 return NULL;
3674 }
3675 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003676 *newpos = size+*newpos;
3677 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003678 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003679 Py_DECREF(restuple);
3680 return NULL;
3681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 Py_INCREF(resunicode);
3683 Py_DECREF(restuple);
3684 return resunicode;
3685}
3686
3687static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 const char *errors,
3690 int limit)
3691{
3692 /* output object */
3693 PyObject *res;
3694 /* pointers to the beginning and end+1 of input */
3695 const Py_UNICODE *startp = p;
3696 const Py_UNICODE *endp = p + size;
3697 /* pointer to the beginning of the unencodable characters */
3698 /* const Py_UNICODE *badp = NULL; */
3699 /* pointer into the output */
3700 char *str;
3701 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003702 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003703 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3704 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 PyObject *errorHandler = NULL;
3706 PyObject *exc = NULL;
3707 /* the following variable is used for caching string comparisons
3708 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3709 int known_errorHandler = -1;
3710
3711 /* allocate enough for a simple encoding without
3712 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003713 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003714 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003715 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003717 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003718 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 ressize = size;
3720
3721 while (p<endp) {
3722 Py_UNICODE c = *p;
3723
3724 /* can we encode this? */
3725 if (c<limit) {
3726 /* no overflow check, because we know that the space is enough */
3727 *str++ = (char)c;
3728 ++p;
3729 }
3730 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003731 Py_ssize_t unicodepos = p-startp;
3732 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003734 Py_ssize_t repsize;
3735 Py_ssize_t newpos;
3736 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 Py_UNICODE *uni2;
3738 /* startpos for collecting unencodable chars */
3739 const Py_UNICODE *collstart = p;
3740 const Py_UNICODE *collend = p;
3741 /* find all unecodable characters */
3742 while ((collend < endp) && ((*collend)>=limit))
3743 ++collend;
3744 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3745 if (known_errorHandler==-1) {
3746 if ((errors==NULL) || (!strcmp(errors, "strict")))
3747 known_errorHandler = 1;
3748 else if (!strcmp(errors, "replace"))
3749 known_errorHandler = 2;
3750 else if (!strcmp(errors, "ignore"))
3751 known_errorHandler = 3;
3752 else if (!strcmp(errors, "xmlcharrefreplace"))
3753 known_errorHandler = 4;
3754 else
3755 known_errorHandler = 0;
3756 }
3757 switch (known_errorHandler) {
3758 case 1: /* strict */
3759 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3760 goto onError;
3761 case 2: /* replace */
3762 while (collstart++<collend)
3763 *str++ = '?'; /* fall through */
3764 case 3: /* ignore */
3765 p = collend;
3766 break;
3767 case 4: /* xmlcharrefreplace */
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003768 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 /* determine replacement size (temporarily (mis)uses p) */
3770 for (p = collstart, repsize = 0; p < collend; ++p) {
3771 if (*p<10)
3772 repsize += 2+1+1;
3773 else if (*p<100)
3774 repsize += 2+2+1;
3775 else if (*p<1000)
3776 repsize += 2+3+1;
3777 else if (*p<10000)
3778 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003779#ifndef Py_UNICODE_WIDE
3780 else
3781 repsize += 2+5+1;
3782#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 else if (*p<100000)
3784 repsize += 2+5+1;
3785 else if (*p<1000000)
3786 repsize += 2+6+1;
3787 else
3788 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003789#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 }
3791 requiredsize = respos+repsize+(endp-collend);
3792 if (requiredsize > ressize) {
3793 if (requiredsize<2*ressize)
3794 requiredsize = 2*ressize;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003795 if (_PyBytes_Resize(&res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 goto onError;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003797 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 ressize = requiredsize;
3799 }
3800 /* generate replacement (temporarily (mis)uses p) */
3801 for (p = collstart; p < collend; ++p) {
3802 str += sprintf(str, "&#%d;", (int)*p);
3803 }
3804 p = collend;
3805 break;
3806 default:
3807 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3808 encoding, reason, startp, size, &exc,
3809 collstart-startp, collend-startp, &newpos);
3810 if (repunicode == NULL)
3811 goto onError;
3812 /* need more space? (at least enough for what we
3813 have+the replacement+the rest of the string, so
3814 we won't have to check space for encodable characters) */
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003815 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 repsize = PyUnicode_GET_SIZE(repunicode);
3817 requiredsize = respos+repsize+(endp-collend);
3818 if (requiredsize > ressize) {
3819 if (requiredsize<2*ressize)
3820 requiredsize = 2*ressize;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003821 if (_PyBytes_Resize(&res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 Py_DECREF(repunicode);
3823 goto onError;
3824 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003825 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826 ressize = requiredsize;
3827 }
3828 /* check if there is anything unencodable in the replacement
3829 and copy it to the output */
3830 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3831 c = *uni2;
3832 if (c >= limit) {
3833 raise_encode_exception(&exc, encoding, startp, size,
3834 unicodepos, unicodepos+1, reason);
3835 Py_DECREF(repunicode);
3836 goto onError;
3837 }
3838 *str = (char)c;
3839 }
3840 p = startp + newpos;
3841 Py_DECREF(repunicode);
3842 }
3843 }
3844 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003845 /* Resize if we allocated to much */
3846 size = str - PyBytes_AS_STRING(res);
3847 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00003848 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003849 if (_PyBytes_Resize(&res, size) < 0)
3850 goto onError;
3851 }
3852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853 Py_XDECREF(errorHandler);
3854 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003855 return res;
3856
3857 onError:
3858 Py_XDECREF(res);
3859 Py_XDECREF(errorHandler);
3860 Py_XDECREF(exc);
3861 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862}
3863
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003865 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 const char *errors)
3867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869}
3870
3871PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3872{
3873 if (!PyUnicode_Check(unicode)) {
3874 PyErr_BadArgument();
3875 return NULL;
3876 }
3877 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3878 PyUnicode_GET_SIZE(unicode),
3879 NULL);
3880}
3881
3882/* --- 7-bit ASCII Codec -------------------------------------------------- */
3883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003885 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 const char *errors)
3887{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 PyUnicodeObject *v;
3890 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 Py_ssize_t startinpos;
3892 Py_ssize_t endinpos;
3893 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 const char *e;
3895 PyObject *errorHandler = NULL;
3896 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003899 if (size == 1 && *(unsigned char*)s < 128) {
3900 Py_UNICODE r = *(unsigned char*)s;
3901 return PyUnicode_FromUnicode(&r, 1);
3902 }
Tim Petersced69f82003-09-16 20:30:58 +00003903
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 v = _PyUnicode_New(size);
3905 if (v == NULL)
3906 goto onError;
3907 if (size == 0)
3908 return (PyObject *)v;
3909 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 e = s + size;
3911 while (s < e) {
3912 register unsigned char c = (unsigned char)*s;
3913 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 ++s;
3916 }
3917 else {
3918 startinpos = s-starts;
3919 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003920 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 if (unicode_decode_call_errorhandler(
3922 errors, &errorHandler,
3923 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003924 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003925 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003929 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003930 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 Py_XDECREF(errorHandler);
3933 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 onError:
3937 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 Py_XDECREF(errorHandler);
3939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 return NULL;
3941}
3942
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003944 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 const char *errors)
3946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948}
3949
3950PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3951{
3952 if (!PyUnicode_Check(unicode)) {
3953 PyErr_BadArgument();
3954 return NULL;
3955 }
3956 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3957 PyUnicode_GET_SIZE(unicode),
3958 NULL);
3959}
3960
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003961#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003962
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003963/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003964
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003965#if SIZEOF_INT < SIZEOF_SSIZE_T
3966#define NEED_RETRY
3967#endif
3968
3969/* XXX This code is limited to "true" double-byte encodings, as
3970 a) it assumes an incomplete character consists of a single byte, and
3971 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3972 encodings, see IsDBCSLeadByteEx documentation. */
3973
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  A byte whose value passes IsDBCSLeadByte() only counts when
   it is not itself the second byte of the preceding character, which is
   checked by stepping back one character with CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move: nothing precedes curr */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character spans two bytes and ends right before curr */
    return (curr - prev == 2);
}
3984
3985/*
3986 * Decode MBCS string into unicode object. If 'final' is set, converts
3987 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3988 */
3989static int decode_mbcs(PyUnicodeObject **v,
3990 const char *s, /* MBCS string */
3991 int size, /* sizeof MBCS string */
3992 int final)
3993{
3994 Py_UNICODE *p;
3995 Py_ssize_t n = 0;
3996 int usize = 0;
3997
3998 assert(size >= 0);
3999
4000 /* Skip trailing lead-byte unless 'final' is set */
4001 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4002 --size;
4003
4004 /* First get the size of the result */
4005 if (size > 0) {
4006 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4007 if (usize == 0) {
4008 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4009 return -1;
4010 }
4011 }
4012
4013 if (*v == NULL) {
4014 /* Create unicode object */
4015 *v = _PyUnicode_New(usize);
4016 if (*v == NULL)
4017 return -1;
4018 }
4019 else {
4020 /* Extend unicode object */
4021 n = PyUnicode_GET_SIZE(*v);
4022 if (_PyUnicode_Resize(v, n + usize) < 0)
4023 return -1;
4024 }
4025
4026 /* Do the conversion */
4027 if (size > 0) {
4028 p = PyUnicode_AS_UNICODE(*v) + n;
4029 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4030 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4031 return -1;
4032 }
4033 }
4034
4035 return size;
4036}
4037
4038PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4039 Py_ssize_t size,
4040 const char *errors,
4041 Py_ssize_t *consumed)
4042{
4043 PyUnicodeObject *v = NULL;
4044 int done;
4045
4046 if (consumed)
4047 *consumed = 0;
4048
4049#ifdef NEED_RETRY
4050 retry:
4051 if (size > INT_MAX)
4052 done = decode_mbcs(&v, s, INT_MAX, 0);
4053 else
4054#endif
4055 done = decode_mbcs(&v, s, (int)size, !consumed);
4056
4057 if (done < 0) {
4058 Py_XDECREF(v);
4059 return NULL;
4060 }
4061
4062 if (consumed)
4063 *consumed += done;
4064
4065#ifdef NEED_RETRY
4066 if (size > INT_MAX) {
4067 s += done;
4068 size -= done;
4069 goto retry;
4070 }
4071#endif
4072
4073 return (PyObject *)v;
4074}
4075
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004076PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004077 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078 const char *errors)
4079{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004080 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4081}
4082
4083/*
4084 * Convert unicode into string object (MBCS).
4085 * Returns 0 if succeed, -1 otherwise.
4086 */
4087static int encode_mbcs(PyObject **repr,
4088 const Py_UNICODE *p, /* unicode */
4089 int size) /* size of unicode */
4090{
4091 int mbcssize = 0;
4092 Py_ssize_t n = 0;
4093
4094 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004095
4096 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004097 if (size > 0) {
4098 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4099 if (mbcssize == 0) {
4100 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4101 return -1;
4102 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004103 }
4104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004105 if (*repr == NULL) {
4106 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004107 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004108 if (*repr == NULL)
4109 return -1;
4110 }
4111 else {
4112 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004113 n = PyBytes_Size(*repr);
Hirokazu Yamamotod88e8fa2008-12-27 14:58:17 +00004114 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004115 return -1;
4116 }
4117
4118 /* Do the conversion */
4119 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004120 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004121 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4122 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4123 return -1;
4124 }
4125 }
4126
4127 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004128}
4129
4130PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004132 const char *errors)
4133{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004134 PyObject *repr = NULL;
4135 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004137#ifdef NEED_RETRY
4138 retry:
4139 if (size > INT_MAX)
4140 ret = encode_mbcs(&repr, p, INT_MAX);
4141 else
4142#endif
4143 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004144
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004145 if (ret < 0) {
4146 Py_XDECREF(repr);
4147 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004148 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004149
4150#ifdef NEED_RETRY
4151 if (size > INT_MAX) {
4152 p += INT_MAX;
4153 size -= INT_MAX;
4154 goto retry;
4155 }
4156#endif
4157
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004158 return repr;
4159}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004160
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004161PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4162{
4163 if (!PyUnicode_Check(unicode)) {
4164 PyErr_BadArgument();
4165 return NULL;
4166 }
4167 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4168 PyUnicode_GET_SIZE(unicode),
4169 NULL);
4170}
4171
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004172#undef NEED_RETRY
4173
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004174#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004175
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176/* --- Character Mapping Codec -------------------------------------------- */
4177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 PyObject *mapping,
4181 const char *errors)
4182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004184 Py_ssize_t startinpos;
4185 Py_ssize_t endinpos;
4186 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 PyUnicodeObject *v;
4189 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004190 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 PyObject *errorHandler = NULL;
4192 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004193 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004194 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004195
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 /* Default to Latin-1 */
4197 if (mapping == NULL)
4198 return PyUnicode_DecodeLatin1(s, size, errors);
4199
4200 v = _PyUnicode_New(size);
4201 if (v == NULL)
4202 goto onError;
4203 if (size == 0)
4204 return (PyObject *)v;
4205 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004207 if (PyUnicode_CheckExact(mapping)) {
4208 mapstring = PyUnicode_AS_UNICODE(mapping);
4209 maplen = PyUnicode_GET_SIZE(mapping);
4210 while (s < e) {
4211 unsigned char ch = *s;
4212 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004214 if (ch < maplen)
4215 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004217 if (x == 0xfffe) {
4218 /* undefined mapping */
4219 outpos = p-PyUnicode_AS_UNICODE(v);
4220 startinpos = s-starts;
4221 endinpos = startinpos+1;
4222 if (unicode_decode_call_errorhandler(
4223 errors, &errorHandler,
4224 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004225 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004226 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004227 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004228 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004229 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004230 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004231 *p++ = x;
4232 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004234 }
4235 else {
4236 while (s < e) {
4237 unsigned char ch = *s;
4238 PyObject *w, *x;
4239
4240 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004241 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004242 if (w == NULL)
4243 goto onError;
4244 x = PyObject_GetItem(mapping, w);
4245 Py_DECREF(w);
4246 if (x == NULL) {
4247 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4248 /* No mapping found means: mapping is undefined. */
4249 PyErr_Clear();
4250 x = Py_None;
4251 Py_INCREF(x);
4252 } else
4253 goto onError;
4254 }
4255
4256 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004257 if (PyLong_Check(x)) {
4258 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004259 if (value < 0 || value > 65535) {
4260 PyErr_SetString(PyExc_TypeError,
4261 "character mapping must be in range(65536)");
4262 Py_DECREF(x);
4263 goto onError;
4264 }
4265 *p++ = (Py_UNICODE)value;
4266 }
4267 else if (x == Py_None) {
4268 /* undefined mapping */
4269 outpos = p-PyUnicode_AS_UNICODE(v);
4270 startinpos = s-starts;
4271 endinpos = startinpos+1;
4272 if (unicode_decode_call_errorhandler(
4273 errors, &errorHandler,
4274 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004275 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004276 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004277 Py_DECREF(x);
4278 goto onError;
4279 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004280 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004281 continue;
4282 }
4283 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004284 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004285
4286 if (targetsize == 1)
4287 /* 1-1 mapping */
4288 *p++ = *PyUnicode_AS_UNICODE(x);
4289
4290 else if (targetsize > 1) {
4291 /* 1-n mapping */
4292 if (targetsize > extrachars) {
4293 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004294 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4295 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004296 (targetsize << 2);
4297 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004298 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004299 if (_PyUnicode_Resize(&v,
4300 PyUnicode_GET_SIZE(v) + needed) < 0) {
4301 Py_DECREF(x);
4302 goto onError;
4303 }
4304 p = PyUnicode_AS_UNICODE(v) + oldpos;
4305 }
4306 Py_UNICODE_COPY(p,
4307 PyUnicode_AS_UNICODE(x),
4308 targetsize);
4309 p += targetsize;
4310 extrachars -= targetsize;
4311 }
4312 /* 1-0 mapping: skip the character */
4313 }
4314 else {
4315 /* wrong return value */
4316 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004317 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004318 Py_DECREF(x);
4319 goto onError;
4320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004322 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 }
4325 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004326 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_XDECREF(errorHandler);
4329 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004331
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 Py_XDECREF(errorHandler);
4334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 Py_XDECREF(v);
4336 return NULL;
4337}
4338
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004339/* Charmap encoding: the lookup table */
4340
4341struct encoding_map{
4342 PyObject_HEAD
4343 unsigned char level1[32];
4344 int count2, count3;
4345 unsigned char level23[1];
4346};
4347
4348static PyObject*
4349encoding_map_size(PyObject *obj, PyObject* args)
4350{
4351 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004352 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004353 128*map->count3);
4354}
4355
4356static PyMethodDef encoding_map_methods[] = {
4357 {"size", encoding_map_size, METH_NOARGS,
4358 PyDoc_STR("Return the size (in bytes) of this object") },
4359 { 0 }
4360};
4361
4362static void
4363encoding_map_dealloc(PyObject* o)
4364{
4365 PyObject_FREE(o);
4366}
4367
4368static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004369 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004370 "EncodingMap", /*tp_name*/
4371 sizeof(struct encoding_map), /*tp_basicsize*/
4372 0, /*tp_itemsize*/
4373 /* methods */
4374 encoding_map_dealloc, /*tp_dealloc*/
4375 0, /*tp_print*/
4376 0, /*tp_getattr*/
4377 0, /*tp_setattr*/
4378 0, /*tp_compare*/
4379 0, /*tp_repr*/
4380 0, /*tp_as_number*/
4381 0, /*tp_as_sequence*/
4382 0, /*tp_as_mapping*/
4383 0, /*tp_hash*/
4384 0, /*tp_call*/
4385 0, /*tp_str*/
4386 0, /*tp_getattro*/
4387 0, /*tp_setattro*/
4388 0, /*tp_as_buffer*/
4389 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4390 0, /*tp_doc*/
4391 0, /*tp_traverse*/
4392 0, /*tp_clear*/
4393 0, /*tp_richcompare*/
4394 0, /*tp_weaklistoffset*/
4395 0, /*tp_iter*/
4396 0, /*tp_iternext*/
4397 encoding_map_methods, /*tp_methods*/
4398 0, /*tp_members*/
4399 0, /*tp_getset*/
4400 0, /*tp_base*/
4401 0, /*tp_dict*/
4402 0, /*tp_descr_get*/
4403 0, /*tp_descr_set*/
4404 0, /*tp_dictoffset*/
4405 0, /*tp_init*/
4406 0, /*tp_alloc*/
4407 0, /*tp_new*/
4408 0, /*tp_free*/
4409 0, /*tp_is_gc*/
4410};
4411
4412PyObject*
4413PyUnicode_BuildEncodingMap(PyObject* string)
4414{
4415 Py_UNICODE *decode;
4416 PyObject *result;
4417 struct encoding_map *mresult;
4418 int i;
4419 int need_dict = 0;
4420 unsigned char level1[32];
4421 unsigned char level2[512];
4422 unsigned char *mlevel1, *mlevel2, *mlevel3;
4423 int count2 = 0, count3 = 0;
4424
4425 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4426 PyErr_BadArgument();
4427 return NULL;
4428 }
4429 decode = PyUnicode_AS_UNICODE(string);
4430 memset(level1, 0xFF, sizeof level1);
4431 memset(level2, 0xFF, sizeof level2);
4432
4433 /* If there isn't a one-to-one mapping of NULL to \0,
4434 or if there are non-BMP characters, we need to use
4435 a mapping dictionary. */
4436 if (decode[0] != 0)
4437 need_dict = 1;
4438 for (i = 1; i < 256; i++) {
4439 int l1, l2;
4440 if (decode[i] == 0
4441 #ifdef Py_UNICODE_WIDE
4442 || decode[i] > 0xFFFF
4443 #endif
4444 ) {
4445 need_dict = 1;
4446 break;
4447 }
4448 if (decode[i] == 0xFFFE)
4449 /* unmapped character */
4450 continue;
4451 l1 = decode[i] >> 11;
4452 l2 = decode[i] >> 7;
4453 if (level1[l1] == 0xFF)
4454 level1[l1] = count2++;
4455 if (level2[l2] == 0xFF)
4456 level2[l2] = count3++;
4457 }
4458
4459 if (count2 >= 0xFF || count3 >= 0xFF)
4460 need_dict = 1;
4461
4462 if (need_dict) {
4463 PyObject *result = PyDict_New();
4464 PyObject *key, *value;
4465 if (!result)
4466 return NULL;
4467 for (i = 0; i < 256; i++) {
4468 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004469 key = PyLong_FromLong(decode[i]);
4470 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004471 if (!key || !value)
4472 goto failed1;
4473 if (PyDict_SetItem(result, key, value) == -1)
4474 goto failed1;
4475 Py_DECREF(key);
4476 Py_DECREF(value);
4477 }
4478 return result;
4479 failed1:
4480 Py_XDECREF(key);
4481 Py_XDECREF(value);
4482 Py_DECREF(result);
4483 return NULL;
4484 }
4485
4486 /* Create a three-level trie */
4487 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4488 16*count2 + 128*count3 - 1);
4489 if (!result)
4490 return PyErr_NoMemory();
4491 PyObject_Init(result, &EncodingMapType);
4492 mresult = (struct encoding_map*)result;
4493 mresult->count2 = count2;
4494 mresult->count3 = count3;
4495 mlevel1 = mresult->level1;
4496 mlevel2 = mresult->level23;
4497 mlevel3 = mresult->level23 + 16*count2;
4498 memcpy(mlevel1, level1, 32);
4499 memset(mlevel2, 0xFF, 16*count2);
4500 memset(mlevel3, 0, 128*count3);
4501 count3 = 0;
4502 for (i = 1; i < 256; i++) {
4503 int o1, o2, o3, i2, i3;
4504 if (decode[i] == 0xFFFE)
4505 /* unmapped character */
4506 continue;
4507 o1 = decode[i]>>11;
4508 o2 = (decode[i]>>7) & 0xF;
4509 i2 = 16*mlevel1[o1] + o2;
4510 if (mlevel2[i2] == 0xFF)
4511 mlevel2[i2] = count3++;
4512 o3 = decode[i] & 0x7F;
4513 i3 = 128*mlevel2[i2] + o3;
4514 mlevel3[i3] = i;
4515 }
4516 return result;
4517}
4518
4519static int
4520encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4521{
4522 struct encoding_map *map = (struct encoding_map*)mapping;
4523 int l1 = c>>11;
4524 int l2 = (c>>7) & 0xF;
4525 int l3 = c & 0x7F;
4526 int i;
4527
4528#ifdef Py_UNICODE_WIDE
4529 if (c > 0xFFFF) {
4530 return -1;
4531 }
4532#endif
4533 if (c == 0)
4534 return 0;
4535 /* level 1*/
4536 i = map->level1[l1];
4537 if (i == 0xFF) {
4538 return -1;
4539 }
4540 /* level 2*/
4541 i = map->level23[16*i+l2];
4542 if (i == 0xFF) {
4543 return -1;
4544 }
4545 /* level 3 */
4546 i = map->level23[16*map->count2 + 128*i + l3];
4547 if (i == 0) {
4548 return -1;
4549 }
4550 return i;
4551}
4552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553/* Lookup the character ch in the mapping. If the character
4554 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004555 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Christian Heimes217cfd12007-12-02 14:31:20 +00004558 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 PyObject *x;
4560
4561 if (w == NULL)
4562 return NULL;
4563 x = PyObject_GetItem(mapping, w);
4564 Py_DECREF(w);
4565 if (x == NULL) {
4566 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4567 /* No mapping found means: mapping is undefined. */
4568 PyErr_Clear();
4569 x = Py_None;
4570 Py_INCREF(x);
4571 return x;
4572 } else
4573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004575 else if (x == Py_None)
4576 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004577 else if (PyLong_Check(x)) {
4578 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 if (value < 0 || value > 255) {
4580 PyErr_SetString(PyExc_TypeError,
4581 "character mapping must be in range(256)");
4582 Py_DECREF(x);
4583 return NULL;
4584 }
4585 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004587 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004591 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004592 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004593 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 Py_DECREF(x);
4595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 }
4597}
4598
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004599static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004600charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601{
Christian Heimes72b710a2008-05-26 13:28:38 +00004602 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004603 /* exponentially overallocate to minimize reallocations */
4604 if (requiredsize < 2*outsize)
4605 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004606 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004607 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004608 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004609}
4610
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or a resize of the output failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004615 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 space is available. Return a new reference to the object that
4617 was put in the output buffer, or Py_None, if the mapping was undefined
4618 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004619 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004621charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004624 PyObject *rep;
4625 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004626 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627
Christian Heimes90aa7642007-12-19 02:45:37 +00004628 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004629 int res = encoding_map_lookup(c, mapping);
4630 Py_ssize_t requiredsize = *outpos+1;
4631 if (res == -1)
4632 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004633 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004634 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004635 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004636 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004637 outstart[(*outpos)++] = (char)res;
4638 return enc_SUCCESS;
4639 }
4640
4641 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004643 return enc_EXCEPTION;
4644 else if (rep==Py_None) {
4645 Py_DECREF(rep);
4646 return enc_FAILED;
4647 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004648 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004650 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004651 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004653 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004655 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004656 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 }
4658 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004659 const char *repchars = PyBytes_AS_STRING(rep);
4660 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004662 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004663 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004665 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004667 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 memcpy(outstart + *outpos, repchars, repsize);
4669 *outpos += repsize;
4670 }
4671 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004672 Py_DECREF(rep);
4673 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674}
4675
4676/* handle an error in PyUnicode_EncodeCharmap
4677 Return 0 on success, -1 on error */
4678static
4679int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004680 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004682 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004683 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684{
4685 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 Py_ssize_t repsize;
4687 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 Py_UNICODE *uni2;
4689 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 Py_ssize_t collstartpos = *inpos;
4691 Py_ssize_t collendpos = *inpos+1;
4692 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 char *encoding = "charmap";
4694 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004695 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 /* find all unencodable characters */
4698 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004699 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004700 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004701 int res = encoding_map_lookup(p[collendpos], mapping);
4702 if (res != -1)
4703 break;
4704 ++collendpos;
4705 continue;
4706 }
4707
4708 rep = charmapencode_lookup(p[collendpos], mapping);
4709 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004711 else if (rep!=Py_None) {
4712 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713 break;
4714 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004715 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 ++collendpos;
4717 }
4718 /* cache callback name lookup
4719 * (if not done yet, i.e. it's the first error) */
4720 if (*known_errorHandler==-1) {
4721 if ((errors==NULL) || (!strcmp(errors, "strict")))
4722 *known_errorHandler = 1;
4723 else if (!strcmp(errors, "replace"))
4724 *known_errorHandler = 2;
4725 else if (!strcmp(errors, "ignore"))
4726 *known_errorHandler = 3;
4727 else if (!strcmp(errors, "xmlcharrefreplace"))
4728 *known_errorHandler = 4;
4729 else
4730 *known_errorHandler = 0;
4731 }
4732 switch (*known_errorHandler) {
4733 case 1: /* strict */
4734 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4735 return -1;
4736 case 2: /* replace */
4737 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4738 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004739 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 return -1;
4741 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004742 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4744 return -1;
4745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 }
4747 /* fall through */
4748 case 3: /* ignore */
4749 *inpos = collendpos;
4750 break;
4751 case 4: /* xmlcharrefreplace */
4752 /* generate replacement (temporarily (mis)uses p) */
4753 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4754 char buffer[2+29+1+1];
4755 char *cp;
4756 sprintf(buffer, "&#%d;", (int)p[collpos]);
4757 for (cp = buffer; *cp; ++cp) {
4758 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004759 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004761 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4763 return -1;
4764 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 }
4766 }
4767 *inpos = collendpos;
4768 break;
4769 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004770 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 encoding, reason, p, size, exceptionObject,
4772 collstartpos, collendpos, &newpos);
4773 if (repunicode == NULL)
4774 return -1;
4775 /* generate replacement */
4776 repsize = PyUnicode_GET_SIZE(repunicode);
4777 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4778 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004779 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 return -1;
4781 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004782 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4785 return -1;
4786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 }
4788 *inpos = newpos;
4789 Py_DECREF(repunicode);
4790 }
4791 return 0;
4792}
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004795 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 PyObject *mapping,
4797 const char *errors)
4798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 /* output object */
4800 PyObject *res = NULL;
4801 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 PyObject *errorHandler = NULL;
4806 PyObject *exc = NULL;
4807 /* the following variable is used for caching string comparisons
4808 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4809 * 3=ignore, 4=xmlcharrefreplace */
4810 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812 /* Default to Latin-1 */
4813 if (mapping == NULL)
4814 return PyUnicode_EncodeLatin1(p, size, errors);
4815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 /* allocate enough for a simple encoding without
4817 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004818 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 if (res == NULL)
4820 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004821 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 while (inpos<size) {
4825 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004826 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004827 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004829 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 if (charmap_encoding_error(p, size, &inpos, mapping,
4831 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004832 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004833 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004834 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 else
4838 /* done with this character => adjust input position */
4839 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004843 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004844 if (_PyBytes_Resize(&res, respos) < 0)
4845 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 Py_XDECREF(exc);
4848 Py_XDECREF(errorHandler);
4849 return res;
4850
4851 onError:
4852 Py_XDECREF(res);
4853 Py_XDECREF(exc);
4854 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 return NULL;
4856}
4857
4858PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4859 PyObject *mapping)
4860{
4861 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4862 PyErr_BadArgument();
4863 return NULL;
4864 }
4865 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4866 PyUnicode_GET_SIZE(unicode),
4867 mapping,
4868 NULL);
4869}
4870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871/* create or adjust a UnicodeTranslateError */
4872static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 const Py_UNICODE *unicode, Py_ssize_t size,
4874 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 if (*exceptionObject == NULL) {
4878 *exceptionObject = PyUnicodeTranslateError_Create(
4879 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 }
4881 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4883 goto onError;
4884 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4885 goto onError;
4886 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4887 goto onError;
4888 return;
4889 onError:
4890 Py_DECREF(*exceptionObject);
4891 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 }
4893}
4894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895/* raises a UnicodeTranslateError */
4896static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 const Py_UNICODE *unicode, Py_ssize_t size,
4898 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 const char *reason)
4900{
4901 make_translate_exception(exceptionObject,
4902 unicode, size, startpos, endpos, reason);
4903 if (*exceptionObject != NULL)
4904 PyCodec_StrictErrors(*exceptionObject);
4905}
4906
4907/* error handling callback helper:
4908 build arguments, call the callback and check the arguments,
4909 put the result into newpos and return the replacement string, which
4910 has to be freed by the caller */
4911static PyObject *unicode_translate_call_errorhandler(const char *errors,
4912 PyObject **errorHandler,
4913 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4915 Py_ssize_t startpos, Py_ssize_t endpos,
4916 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004918 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004920 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 PyObject *restuple;
4922 PyObject *resunicode;
4923
4924 if (*errorHandler == NULL) {
4925 *errorHandler = PyCodec_LookupError(errors);
4926 if (*errorHandler == NULL)
4927 return NULL;
4928 }
4929
4930 make_translate_exception(exceptionObject,
4931 unicode, size, startpos, endpos, reason);
4932 if (*exceptionObject == NULL)
4933 return NULL;
4934
4935 restuple = PyObject_CallFunctionObjArgs(
4936 *errorHandler, *exceptionObject, NULL);
4937 if (restuple == NULL)
4938 return NULL;
4939 if (!PyTuple_Check(restuple)) {
4940 PyErr_Format(PyExc_TypeError, &argparse[4]);
4941 Py_DECREF(restuple);
4942 return NULL;
4943 }
4944 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004945 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 Py_DECREF(restuple);
4947 return NULL;
4948 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004949 if (i_newpos<0)
4950 *newpos = size+i_newpos;
4951 else
4952 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004953 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004954 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004955 Py_DECREF(restuple);
4956 return NULL;
4957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 Py_INCREF(resunicode);
4959 Py_DECREF(restuple);
4960 return resunicode;
4961}
4962
4963/* Lookup the character ch in the mapping and put the result in result,
4964 which must be decrefed by the caller.
4965 Return 0 on success, -1 on error */
4966static
4967int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4968{
Christian Heimes217cfd12007-12-02 14:31:20 +00004969 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 PyObject *x;
4971
4972 if (w == NULL)
4973 return -1;
4974 x = PyObject_GetItem(mapping, w);
4975 Py_DECREF(w);
4976 if (x == NULL) {
4977 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4978 /* No mapping found means: use 1:1 mapping. */
4979 PyErr_Clear();
4980 *result = NULL;
4981 return 0;
4982 } else
4983 return -1;
4984 }
4985 else if (x == Py_None) {
4986 *result = x;
4987 return 0;
4988 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004989 else if (PyLong_Check(x)) {
4990 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 long max = PyUnicode_GetMax();
4992 if (value < 0 || value > max) {
4993 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004994 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 Py_DECREF(x);
4996 return -1;
4997 }
4998 *result = x;
4999 return 0;
5000 }
5001 else if (PyUnicode_Check(x)) {
5002 *result = x;
5003 return 0;
5004 }
5005 else {
5006 /* wrong return value */
5007 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005008 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005009 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 return -1;
5011 }
5012}
5013/* ensure that *outobj is at least requiredsize characters long,
5014if not reallocate and adjust various state variables.
5015Return 0 on success, -1 on error */
5016static
Walter Dörwald4894c302003-10-24 14:25:28 +00005017int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005018 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005020 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005021 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005023 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005025 if (requiredsize < 2 * oldsize)
5026 requiredsize = 2 * oldsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005027 if (PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 return -1;
5029 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 }
5031 return 0;
5032}
5033/* lookup the character, put the result in the output string and adjust
5034 various state variables. Return a new reference to the object that
5035 was put in the output buffer in *result, or Py_None, if the mapping was
5036 undefined (in which case no character was written).
5037 The called must decref result.
5038 Return 0 on success, -1 on error. */
5039static
Walter Dörwald4894c302003-10-24 14:25:28 +00005040int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005041 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005042 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043{
Walter Dörwald4894c302003-10-24 14:25:28 +00005044 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 return -1;
5046 if (*res==NULL) {
5047 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005048 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 }
5050 else if (*res==Py_None)
5051 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005052 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005054 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 }
5056 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005057 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 if (repsize==1) {
5059 /* no overflow check, because we know that the space is enough */
5060 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5061 }
5062 else if (repsize!=0) {
5063 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005064 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005065 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005066 repsize - 1;
5067 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 return -1;
5069 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5070 *outp += repsize;
5071 }
5072 }
5073 else
5074 return -1;
5075 return 0;
5076}
5077
5078PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005079 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 PyObject *mapping,
5081 const char *errors)
5082{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 /* output object */
5084 PyObject *res = NULL;
5085 /* pointers to the beginning and end+1 of input */
5086 const Py_UNICODE *startp = p;
5087 const Py_UNICODE *endp = p + size;
5088 /* pointer into the output */
5089 Py_UNICODE *str;
5090 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005091 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 char *reason = "character maps to <undefined>";
5093 PyObject *errorHandler = NULL;
5094 PyObject *exc = NULL;
5095 /* the following variable is used for caching string comparisons
5096 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5097 * 3=ignore, 4=xmlcharrefreplace */
5098 int known_errorHandler = -1;
5099
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 if (mapping == NULL) {
5101 PyErr_BadArgument();
5102 return NULL;
5103 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104
5105 /* allocate enough for a simple 1:1 translation without
5106 replacements, if we need more, we'll resize */
5107 res = PyUnicode_FromUnicode(NULL, size);
5108 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 return res;
5112 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 while (p<endp) {
5115 /* try to encode it */
5116 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005117 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 goto onError;
5120 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005121 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 if (x!=Py_None) /* it worked => adjust input pointer */
5123 ++p;
5124 else { /* untranslatable character */
5125 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005126 Py_ssize_t repsize;
5127 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 Py_UNICODE *uni2;
5129 /* startpos for collecting untranslatable chars */
5130 const Py_UNICODE *collstart = p;
5131 const Py_UNICODE *collend = p+1;
5132 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 /* find all untranslatable characters */
5135 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005136 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 goto onError;
5138 Py_XDECREF(x);
5139 if (x!=Py_None)
5140 break;
5141 ++collend;
5142 }
5143 /* cache callback name lookup
5144 * (if not done yet, i.e. it's the first error) */
5145 if (known_errorHandler==-1) {
5146 if ((errors==NULL) || (!strcmp(errors, "strict")))
5147 known_errorHandler = 1;
5148 else if (!strcmp(errors, "replace"))
5149 known_errorHandler = 2;
5150 else if (!strcmp(errors, "ignore"))
5151 known_errorHandler = 3;
5152 else if (!strcmp(errors, "xmlcharrefreplace"))
5153 known_errorHandler = 4;
5154 else
5155 known_errorHandler = 0;
5156 }
5157 switch (known_errorHandler) {
5158 case 1: /* strict */
5159 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5160 goto onError;
5161 case 2: /* replace */
5162 /* No need to check for space, this is a 1:1 replacement */
5163 for (coll = collstart; coll<collend; ++coll)
5164 *str++ = '?';
5165 /* fall through */
5166 case 3: /* ignore */
5167 p = collend;
5168 break;
5169 case 4: /* xmlcharrefreplace */
5170 /* generate replacement (temporarily (mis)uses p) */
5171 for (p = collstart; p < collend; ++p) {
5172 char buffer[2+29+1+1];
5173 char *cp;
5174 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005175 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005176 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5177 goto onError;
5178 for (cp = buffer; *cp; ++cp)
5179 *str++ = *cp;
5180 }
5181 p = collend;
5182 break;
5183 default:
5184 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5185 reason, startp, size, &exc,
5186 collstart-startp, collend-startp, &newpos);
5187 if (repunicode == NULL)
5188 goto onError;
5189 /* generate replacement */
5190 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005191 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5193 Py_DECREF(repunicode);
5194 goto onError;
5195 }
5196 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5197 *str++ = *uni2;
5198 p = startp + newpos;
5199 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 }
5201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 /* Resize if we allocated to much */
5204 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005205 if (respos<PyUnicode_GET_SIZE(res)) {
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005206 if (PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005207 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 }
5209 Py_XDECREF(exc);
5210 Py_XDECREF(errorHandler);
5211 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 onError:
5214 Py_XDECREF(res);
5215 Py_XDECREF(exc);
5216 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 return NULL;
5218}
5219
5220PyObject *PyUnicode_Translate(PyObject *str,
5221 PyObject *mapping,
5222 const char *errors)
5223{
5224 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005225
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 str = PyUnicode_FromObject(str);
5227 if (str == NULL)
5228 goto onError;
5229 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5230 PyUnicode_GET_SIZE(str),
5231 mapping,
5232 errors);
5233 Py_DECREF(str);
5234 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 onError:
5237 Py_XDECREF(str);
5238 return NULL;
5239}
Tim Petersced69f82003-09-16 20:30:58 +00005240
Guido van Rossum9e896b32000-04-05 20:11:21 +00005241/* --- Decimal Encoder ---------------------------------------------------- */
5242
5243int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005245 char *output,
5246 const char *errors)
5247{
5248 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 PyObject *errorHandler = NULL;
5250 PyObject *exc = NULL;
5251 const char *encoding = "decimal";
5252 const char *reason = "invalid decimal Unicode string";
5253 /* the following variable is used for caching string comparisons
5254 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5255 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005256
5257 if (output == NULL) {
5258 PyErr_BadArgument();
5259 return -1;
5260 }
5261
5262 p = s;
5263 end = s + length;
5264 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005266 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005268 Py_ssize_t repsize;
5269 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 Py_UNICODE *uni2;
5271 Py_UNICODE *collstart;
5272 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005273
Guido van Rossum9e896b32000-04-05 20:11:21 +00005274 if (Py_UNICODE_ISSPACE(ch)) {
5275 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005276 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005277 continue;
5278 }
5279 decimal = Py_UNICODE_TODECIMAL(ch);
5280 if (decimal >= 0) {
5281 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005283 continue;
5284 }
Guido van Rossumba477042000-04-06 18:18:10 +00005285 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005286 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005288 continue;
5289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 /* All other characters are considered unencodable */
5291 collstart = p;
5292 collend = p+1;
5293 while (collend < end) {
5294 if ((0 < *collend && *collend < 256) ||
5295 !Py_UNICODE_ISSPACE(*collend) ||
5296 Py_UNICODE_TODECIMAL(*collend))
5297 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005298 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005299 /* cache callback name lookup
5300 * (if not done yet, i.e. it's the first error) */
5301 if (known_errorHandler==-1) {
5302 if ((errors==NULL) || (!strcmp(errors, "strict")))
5303 known_errorHandler = 1;
5304 else if (!strcmp(errors, "replace"))
5305 known_errorHandler = 2;
5306 else if (!strcmp(errors, "ignore"))
5307 known_errorHandler = 3;
5308 else if (!strcmp(errors, "xmlcharrefreplace"))
5309 known_errorHandler = 4;
5310 else
5311 known_errorHandler = 0;
5312 }
5313 switch (known_errorHandler) {
5314 case 1: /* strict */
5315 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5316 goto onError;
5317 case 2: /* replace */
5318 for (p = collstart; p < collend; ++p)
5319 *output++ = '?';
5320 /* fall through */
5321 case 3: /* ignore */
5322 p = collend;
5323 break;
5324 case 4: /* xmlcharrefreplace */
5325 /* generate replacement (temporarily (mis)uses p) */
5326 for (p = collstart; p < collend; ++p)
5327 output += sprintf(output, "&#%d;", (int)*p);
5328 p = collend;
5329 break;
5330 default:
5331 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5332 encoding, reason, s, length, &exc,
5333 collstart-s, collend-s, &newpos);
5334 if (repunicode == NULL)
5335 goto onError;
5336 /* generate replacement */
5337 repsize = PyUnicode_GET_SIZE(repunicode);
5338 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5339 Py_UNICODE ch = *uni2;
5340 if (Py_UNICODE_ISSPACE(ch))
5341 *output++ = ' ';
5342 else {
5343 decimal = Py_UNICODE_TODECIMAL(ch);
5344 if (decimal >= 0)
5345 *output++ = '0' + decimal;
5346 else if (0 < ch && ch < 256)
5347 *output++ = (char)ch;
5348 else {
5349 Py_DECREF(repunicode);
5350 raise_encode_exception(&exc, encoding,
5351 s, length, collstart-s, collend-s, reason);
5352 goto onError;
5353 }
5354 }
5355 }
5356 p = s + newpos;
5357 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005358 }
5359 }
5360 /* 0-terminate the output string */
5361 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362 Py_XDECREF(exc);
5363 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005364 return 0;
5365
5366 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 Py_XDECREF(exc);
5368 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005369 return -1;
5370}
5371
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372/* --- Helpers ------------------------------------------------------------ */
5373
Eric Smith8c663262007-08-25 02:26:07 +00005374#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005375#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005376#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005377/* Include _ParseTupleFinds from find.h */
5378#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005379#include "stringlib/find.h"
5380#include "stringlib/partition.h"
5381
Eric Smith5807c412008-05-11 21:00:57 +00005382#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5383#include "stringlib/localeutil.h"
5384
/* helper macro to fixup start/end slice values

   Operates on variables named start and end that must be in scope at
   the point of use, clamping them against (obj)->length:
     - a negative start is offset by the length, then floored at 0;
     - an end beyond the length is cut back to the length;
     - a negative end is offset by the length, then floored at 0.
   start is not clamped from above, so start may still exceed end
   afterwards.

   Wrapped in do { } while (0) so that the expansion is one statement
   and stays correct inside an unbraced if/else; use it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5397
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005399 PyObject *substr,
5400 Py_ssize_t start,
5401 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005403 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005404 PyUnicodeObject* str_obj;
5405 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005406
Thomas Wouters477c8d52006-05-27 19:21:47 +00005407 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5408 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5411 if (!sub_obj) {
5412 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 return -1;
5414 }
Tim Petersced69f82003-09-16 20:30:58 +00005415
Thomas Wouters477c8d52006-05-27 19:21:47 +00005416 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005417
Thomas Wouters477c8d52006-05-27 19:21:47 +00005418 result = stringlib_count(
5419 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5420 );
5421
5422 Py_DECREF(sub_obj);
5423 Py_DECREF(str_obj);
5424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 return result;
5426}
5427
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005429 PyObject *sub,
5430 Py_ssize_t start,
5431 Py_ssize_t end,
5432 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005434 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005437 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005438 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005439 sub = PyUnicode_FromObject(sub);
5440 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005441 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005442 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 }
Tim Petersced69f82003-09-16 20:30:58 +00005444
Thomas Wouters477c8d52006-05-27 19:21:47 +00005445 if (direction > 0)
5446 result = stringlib_find_slice(
5447 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5448 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5449 start, end
5450 );
5451 else
5452 result = stringlib_rfind_slice(
5453 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5454 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5455 start, end
5456 );
5457
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005459 Py_DECREF(sub);
5460
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 return result;
5462}
5463
Tim Petersced69f82003-09-16 20:30:58 +00005464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465int tailmatch(PyUnicodeObject *self,
5466 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t start,
5468 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 int direction)
5470{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 if (substring->length == 0)
5472 return 1;
5473
Thomas Wouters477c8d52006-05-27 19:21:47 +00005474 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476 end -= substring->length;
5477 if (end < start)
5478 return 0;
5479
5480 if (direction > 0) {
5481 if (Py_UNICODE_MATCH(self, end, substring))
5482 return 1;
5483 } else {
5484 if (Py_UNICODE_MATCH(self, start, substring))
5485 return 1;
5486 }
5487
5488 return 0;
5489}
5490
Martin v. Löwis18e16552006-02-15 17:27:45 +00005491Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005493 Py_ssize_t start,
5494 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 int direction)
5496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005497 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 str = PyUnicode_FromObject(str);
5500 if (str == NULL)
5501 return -1;
5502 substr = PyUnicode_FromObject(substr);
5503 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005504 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 return -1;
5506 }
Tim Petersced69f82003-09-16 20:30:58 +00005507
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 result = tailmatch((PyUnicodeObject *)str,
5509 (PyUnicodeObject *)substr,
5510 start, end, direction);
5511 Py_DECREF(str);
5512 Py_DECREF(substr);
5513 return result;
5514}
5515
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516/* Apply fixfct filter to the Unicode object self and return a
5517 reference to the modified object */
5518
Tim Petersced69f82003-09-16 20:30:58 +00005519static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520PyObject *fixup(PyUnicodeObject *self,
5521 int (*fixfct)(PyUnicodeObject *s))
5522{
5523
5524 PyUnicodeObject *u;
5525
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005526 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (u == NULL)
5528 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005529
5530 Py_UNICODE_COPY(u->str, self->str, self->length);
5531
Tim Peters7a29bd52001-09-12 03:03:31 +00005532 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 /* fixfct should return TRUE if it modified the buffer. If
5534 FALSE, return a reference to the original buffer instead
5535 (to save space, not time) */
5536 Py_INCREF(self);
5537 Py_DECREF(u);
5538 return (PyObject*) self;
5539 }
5540 return (PyObject*) u;
5541}
5542
Tim Petersced69f82003-09-16 20:30:58 +00005543static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544int fixupper(PyUnicodeObject *self)
5545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005546 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 Py_UNICODE *s = self->str;
5548 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 while (len-- > 0) {
5551 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 ch = Py_UNICODE_TOUPPER(*s);
5554 if (ch != *s) {
5555 status = 1;
5556 *s = ch;
5557 }
5558 s++;
5559 }
5560
5561 return status;
5562}
5563
Tim Petersced69f82003-09-16 20:30:58 +00005564static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565int fixlower(PyUnicodeObject *self)
5566{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005567 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 Py_UNICODE *s = self->str;
5569 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005570
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 while (len-- > 0) {
5572 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 ch = Py_UNICODE_TOLOWER(*s);
5575 if (ch != *s) {
5576 status = 1;
5577 *s = ch;
5578 }
5579 s++;
5580 }
5581
5582 return status;
5583}
5584
Tim Petersced69f82003-09-16 20:30:58 +00005585static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586int fixswapcase(PyUnicodeObject *self)
5587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005588 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 Py_UNICODE *s = self->str;
5590 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005591
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 while (len-- > 0) {
5593 if (Py_UNICODE_ISUPPER(*s)) {
5594 *s = Py_UNICODE_TOLOWER(*s);
5595 status = 1;
5596 } else if (Py_UNICODE_ISLOWER(*s)) {
5597 *s = Py_UNICODE_TOUPPER(*s);
5598 status = 1;
5599 }
5600 s++;
5601 }
5602
5603 return status;
5604}
5605
Tim Petersced69f82003-09-16 20:30:58 +00005606static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607int fixcapitalize(PyUnicodeObject *self)
5608{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005609 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005610 Py_UNICODE *s = self->str;
5611 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005612
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005613 if (len == 0)
5614 return 0;
5615 if (Py_UNICODE_ISLOWER(*s)) {
5616 *s = Py_UNICODE_TOUPPER(*s);
5617 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005619 s++;
5620 while (--len > 0) {
5621 if (Py_UNICODE_ISUPPER(*s)) {
5622 *s = Py_UNICODE_TOLOWER(*s);
5623 status = 1;
5624 }
5625 s++;
5626 }
5627 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628}
5629
5630static
5631int fixtitle(PyUnicodeObject *self)
5632{
5633 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5634 register Py_UNICODE *e;
5635 int previous_is_cased;
5636
5637 /* Shortcut for single character strings */
5638 if (PyUnicode_GET_SIZE(self) == 1) {
5639 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5640 if (*p != ch) {
5641 *p = ch;
5642 return 1;
5643 }
5644 else
5645 return 0;
5646 }
Tim Petersced69f82003-09-16 20:30:58 +00005647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 e = p + PyUnicode_GET_SIZE(self);
5649 previous_is_cased = 0;
5650 for (; p < e; p++) {
5651 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 if (previous_is_cased)
5654 *p = Py_UNICODE_TOLOWER(ch);
5655 else
5656 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005657
5658 if (Py_UNICODE_ISLOWER(ch) ||
5659 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 Py_UNICODE_ISTITLE(ch))
5661 previous_is_cased = 1;
5662 else
5663 previous_is_cased = 0;
5664 }
5665 return 1;
5666}
5667
Tim Peters8ce9f162004-08-27 01:49:32 +00005668PyObject *
5669PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670{
Skip Montanaro6543b452004-09-16 03:28:13 +00005671 const Py_UNICODE blank = ' ';
5672 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005673 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005675 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5676 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005677 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5678 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005680 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Tim Peters05eba1f2004-08-27 21:32:02 +00005682 fseq = PySequence_Fast(seq, "");
5683 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 }
5686
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005687 /* NOTE: the following code can't call back into Python code,
5688 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005689 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005690
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 seqlen = PySequence_Fast_GET_SIZE(fseq);
5692 /* If empty sequence, return u"". */
5693 if (seqlen == 0) {
5694 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5695 goto Done;
5696 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005697 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005698 /* If singleton sequence with an exact Unicode, return that. */
5699 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005700 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005701 if (PyUnicode_CheckExact(item)) {
5702 Py_INCREF(item);
5703 res = (PyUnicodeObject *)item;
5704 goto Done;
5705 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005706 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005707 else {
5708 /* Set up sep and seplen */
5709 if (separator == NULL) {
5710 sep = &blank;
5711 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005712 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005713 else {
5714 if (!PyUnicode_Check(separator)) {
5715 PyErr_Format(PyExc_TypeError,
5716 "separator: expected str instance,"
5717 " %.80s found",
5718 Py_TYPE(separator)->tp_name);
5719 goto onError;
5720 }
5721 sep = PyUnicode_AS_UNICODE(separator);
5722 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005723 }
5724 }
5725
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005726 /* There are at least two things to join, or else we have a subclass
5727 * of str in the sequence.
5728 * Do a pre-pass to figure out the total amount of space we'll
5729 * need (sz), and see whether all argument are strings.
5730 */
5731 sz = 0;
5732 for (i = 0; i < seqlen; i++) {
5733 const Py_ssize_t old_sz = sz;
5734 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005735 if (!PyUnicode_Check(item)) {
5736 PyErr_Format(PyExc_TypeError,
5737 "sequence item %zd: expected str instance,"
5738 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005739 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005740 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005741 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005742 sz += PyUnicode_GET_SIZE(item);
5743 if (i != 0)
5744 sz += seplen;
5745 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5746 PyErr_SetString(PyExc_OverflowError,
5747 "join() result is too long for a Python string");
5748 goto onError;
5749 }
5750 }
Tim Petersced69f82003-09-16 20:30:58 +00005751
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005752 res = _PyUnicode_New(sz);
5753 if (res == NULL)
5754 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005755
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005756 /* Catenate everything. */
5757 res_p = PyUnicode_AS_UNICODE(res);
5758 for (i = 0; i < seqlen; ++i) {
5759 Py_ssize_t itemlen;
5760 item = items[i];
5761 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005762 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005763 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005764 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005765 res_p += seplen;
5766 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005767 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5768 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005769 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005770
Tim Peters8ce9f162004-08-27 01:49:32 +00005771 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005772 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 return (PyObject *)res;
5774
5775 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005776 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005777 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
5782PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005783 Py_ssize_t left,
5784 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 Py_UNICODE fill)
5786{
5787 PyUnicodeObject *u;
5788
5789 if (left < 0)
5790 left = 0;
5791 if (right < 0)
5792 right = 0;
5793
Tim Peters7a29bd52001-09-12 03:03:31 +00005794 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 Py_INCREF(self);
5796 return self;
5797 }
5798
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005799 if (left > PY_SSIZE_T_MAX - self->length ||
5800 right > PY_SSIZE_T_MAX - (left + self->length)) {
5801 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5802 return NULL;
5803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 u = _PyUnicode_New(left + self->length + right);
5805 if (u) {
5806 if (left)
5807 Py_UNICODE_FILL(u->str, fill, left);
5808 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5809 if (right)
5810 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5811 }
5812
5813 return u;
5814}
5815
/* Create a unicode object from data[left:right] and append it to list.

   Relies on names from the expanding function: a PyObject *str scratch
   variable, the target list, and an onError label that is jumped to
   when either the object creation or the append fails.  The temporary
   reference held in str is released in every case.

   Wrapped in do { } while (0) so that the expansion is one statement
   and stays correct inside an unbraced if/else; use it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5826
5827static
5828PyObject *split_whitespace(PyUnicodeObject *self,
5829 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005830 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 register Py_ssize_t i;
5833 register Py_ssize_t j;
5834 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005836 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837
5838 for (i = j = 0; i < len; ) {
5839 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005840 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 i++;
5842 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005843 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 i++;
5845 if (j < i) {
5846 if (maxcount-- <= 0)
5847 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005848 SPLIT_APPEND(buf, j, i);
5849 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 i++;
5851 j = i;
5852 }
5853 }
5854 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005855 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 }
5857 return list;
5858
5859 onError:
5860 Py_DECREF(list);
5861 return NULL;
5862}
5863
5864PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005865 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 register Py_ssize_t i;
5868 register Py_ssize_t j;
5869 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 PyObject *list;
5871 PyObject *str;
5872 Py_UNICODE *data;
5873
5874 string = PyUnicode_FromObject(string);
5875 if (string == NULL)
5876 return NULL;
5877 data = PyUnicode_AS_UNICODE(string);
5878 len = PyUnicode_GET_SIZE(string);
5879
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 list = PyList_New(0);
5881 if (!list)
5882 goto onError;
5883
5884 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005885 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005886
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
5891 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005892 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 if (i < len) {
5894 if (data[i] == '\r' && i + 1 < len &&
5895 data[i+1] == '\n')
5896 i += 2;
5897 else
5898 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005899 if (keepends)
5900 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 }
Guido van Rossum86662912000-04-11 15:38:46 +00005902 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 j = i;
5904 }
5905 if (j < len) {
5906 SPLIT_APPEND(data, j, len);
5907 }
5908
5909 Py_DECREF(string);
5910 return list;
5911
5912 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005913 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 Py_DECREF(string);
5915 return NULL;
5916}
5917
Tim Petersced69f82003-09-16 20:30:58 +00005918static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919PyObject *split_char(PyUnicodeObject *self,
5920 PyObject *list,
5921 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005924 register Py_ssize_t i;
5925 register Py_ssize_t j;
5926 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005928 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
5930 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005931 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 if (maxcount-- <= 0)
5933 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005934 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 i = j = i + 1;
5936 } else
5937 i++;
5938 }
5939 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005940 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
5942 return list;
5943
5944 onError:
5945 Py_DECREF(list);
5946 return NULL;
5947}
5948
Tim Petersced69f82003-09-16 20:30:58 +00005949static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950PyObject *split_substring(PyUnicodeObject *self,
5951 PyObject *list,
5952 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005953 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005955 register Py_ssize_t i;
5956 register Py_ssize_t j;
5957 Py_ssize_t len = self->length;
5958 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 PyObject *str;
5960
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005961 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 if (Py_UNICODE_MATCH(self, i, substring)) {
5963 if (maxcount-- <= 0)
5964 break;
5965 SPLIT_APPEND(self->str, j, i);
5966 i = j = i + sublen;
5967 } else
5968 i++;
5969 }
5970 if (j <= len) {
5971 SPLIT_APPEND(self->str, j, len);
5972 }
5973 return list;
5974
5975 onError:
5976 Py_DECREF(list);
5977 return NULL;
5978}
5979
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005980static
5981PyObject *rsplit_whitespace(PyUnicodeObject *self,
5982 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005983 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005985 register Py_ssize_t i;
5986 register Py_ssize_t j;
5987 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005988 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005989 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005990
5991 for (i = j = len - 1; i >= 0; ) {
5992 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005993 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994 i--;
5995 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005996 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005997 i--;
5998 if (j > i) {
5999 if (maxcount-- <= 0)
6000 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006001 SPLIT_APPEND(buf, i + 1, j + 1);
6002 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006003 i--;
6004 j = i;
6005 }
6006 }
6007 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006008 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006009 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 if (PyList_Reverse(list) < 0)
6011 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012 return list;
6013
6014 onError:
6015 Py_DECREF(list);
6016 return NULL;
6017}
6018
6019static
6020PyObject *rsplit_char(PyUnicodeObject *self,
6021 PyObject *list,
6022 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006023 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006024{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 register Py_ssize_t i;
6026 register Py_ssize_t j;
6027 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006028 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006029 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006030
6031 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006032 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006033 if (maxcount-- <= 0)
6034 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006035 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006036 j = i = i - 1;
6037 } else
6038 i--;
6039 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006040 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006041 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006042 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043 if (PyList_Reverse(list) < 0)
6044 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006045 return list;
6046
6047 onError:
6048 Py_DECREF(list);
6049 return NULL;
6050}
6051
6052static
6053PyObject *rsplit_substring(PyUnicodeObject *self,
6054 PyObject *list,
6055 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006056 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006057{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006058 register Py_ssize_t i;
6059 register Py_ssize_t j;
6060 Py_ssize_t len = self->length;
6061 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006062 PyObject *str;
6063
6064 for (i = len - sublen, j = len; i >= 0; ) {
6065 if (Py_UNICODE_MATCH(self, i, substring)) {
6066 if (maxcount-- <= 0)
6067 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006068 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006069 j = i;
6070 i -= sublen;
6071 } else
6072 i--;
6073 }
6074 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006075 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006076 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006077 if (PyList_Reverse(list) < 0)
6078 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006079 return list;
6080
6081 onError:
6082 Py_DECREF(list);
6083 return NULL;
6084}
6085
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086#undef SPLIT_APPEND
6087
6088static
6089PyObject *split(PyUnicodeObject *self,
6090 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006091 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
6093 PyObject *list;
6094
6095 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006096 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098 list = PyList_New(0);
6099 if (!list)
6100 return NULL;
6101
6102 if (substring == NULL)
6103 return split_whitespace(self,list,maxcount);
6104
6105 else if (substring->length == 1)
6106 return split_char(self,list,substring->str[0],maxcount);
6107
6108 else if (substring->length == 0) {
6109 Py_DECREF(list);
6110 PyErr_SetString(PyExc_ValueError, "empty separator");
6111 return NULL;
6112 }
6113 else
6114 return split_substring(self,list,substring,maxcount);
6115}
6116
Tim Petersced69f82003-09-16 20:30:58 +00006117static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006118PyObject *rsplit(PyUnicodeObject *self,
6119 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006121{
6122 PyObject *list;
6123
6124 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006125 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006126
6127 list = PyList_New(0);
6128 if (!list)
6129 return NULL;
6130
6131 if (substring == NULL)
6132 return rsplit_whitespace(self,list,maxcount);
6133
6134 else if (substring->length == 1)
6135 return rsplit_char(self,list,substring->str[0],maxcount);
6136
6137 else if (substring->length == 0) {
6138 Py_DECREF(list);
6139 PyErr_SetString(PyExc_ValueError, "empty separator");
6140 return NULL;
6141 }
6142 else
6143 return rsplit_substring(self,list,substring,maxcount);
6144}
6145
6146static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147PyObject *replace(PyUnicodeObject *self,
6148 PyUnicodeObject *str1,
6149 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006150 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
6152 PyUnicodeObject *u;
6153
6154 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006155 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Thomas Wouters477c8d52006-05-27 19:21:47 +00006157 if (str1->length == str2->length) {
6158 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006159 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006160 if (str1->length == 1) {
6161 /* replace characters */
6162 Py_UNICODE u1, u2;
6163 if (!findchar(self->str, self->length, str1->str[0]))
6164 goto nothing;
6165 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6166 if (!u)
6167 return NULL;
6168 Py_UNICODE_COPY(u->str, self->str, self->length);
6169 u1 = str1->str[0];
6170 u2 = str2->str[0];
6171 for (i = 0; i < u->length; i++)
6172 if (u->str[i] == u1) {
6173 if (--maxcount < 0)
6174 break;
6175 u->str[i] = u2;
6176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006178 i = fastsearch(
6179 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006181 if (i < 0)
6182 goto nothing;
6183 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6184 if (!u)
6185 return NULL;
6186 Py_UNICODE_COPY(u->str, self->str, self->length);
6187 while (i <= self->length - str1->length)
6188 if (Py_UNICODE_MATCH(self, i, str1)) {
6189 if (--maxcount < 0)
6190 break;
6191 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6192 i += str1->length;
6193 } else
6194 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006197
6198 Py_ssize_t n, i, j, e;
6199 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 Py_UNICODE *p;
6201
6202 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006203 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 if (n > maxcount)
6205 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006206 if (n == 0)
6207 goto nothing;
6208 /* new_size = self->length + n * (str2->length - str1->length)); */
6209 delta = (str2->length - str1->length);
6210 if (delta == 0) {
6211 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006213 product = n * (str2->length - str1->length);
6214 if ((product / (str2->length - str1->length)) != n) {
6215 PyErr_SetString(PyExc_OverflowError,
6216 "replace string is too long");
6217 return NULL;
6218 }
6219 new_size = self->length + product;
6220 if (new_size < 0) {
6221 PyErr_SetString(PyExc_OverflowError,
6222 "replace string is too long");
6223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 }
6225 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006226 u = _PyUnicode_New(new_size);
6227 if (!u)
6228 return NULL;
6229 i = 0;
6230 p = u->str;
6231 e = self->length - str1->length;
6232 if (str1->length > 0) {
6233 while (n-- > 0) {
6234 /* look for next match */
6235 j = i;
6236 while (j <= e) {
6237 if (Py_UNICODE_MATCH(self, j, str1))
6238 break;
6239 j++;
6240 }
6241 if (j > i) {
6242 if (j > e)
6243 break;
6244 /* copy unchanged part [i:j] */
6245 Py_UNICODE_COPY(p, self->str+i, j-i);
6246 p += j - i;
6247 }
6248 /* copy substitution string */
6249 if (str2->length > 0) {
6250 Py_UNICODE_COPY(p, str2->str, str2->length);
6251 p += str2->length;
6252 }
6253 i = j + str1->length;
6254 }
6255 if (i < self->length)
6256 /* copy tail [i:] */
6257 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6258 } else {
6259 /* interleave */
6260 while (n > 0) {
6261 Py_UNICODE_COPY(p, str2->str, str2->length);
6262 p += str2->length;
6263 if (--n <= 0)
6264 break;
6265 *p++ = self->str[i++];
6266 }
6267 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006271
6272nothing:
6273 /* nothing to replace; return original string (when possible) */
6274 if (PyUnicode_CheckExact(self)) {
6275 Py_INCREF(self);
6276 return (PyObject *) self;
6277 }
6278 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279}
6280
6281/* --- Unicode Object Methods --------------------------------------------- */
6282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006283PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006284"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285\n\
6286Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006287characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
6289static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006290unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 return fixup(self, fixtitle);
6293}
6294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006295PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006296"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297\n\
6298Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006299have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
6301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006302unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 return fixup(self, fixcapitalize);
6305}
6306
#if 0
/* Disabled: compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in place in the
   resulting list, then join the words again.  Returns the joined string,
   or NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous item itself */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
6344
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006345/* Argument converter. Coerces to a single unicode character */
6346
6347static int
6348convert_uc(PyObject *obj, void *addr)
6349{
6350 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6351 PyObject *uniobj;
6352 Py_UNICODE *unistr;
6353
6354 uniobj = PyUnicode_FromObject(obj);
6355 if (uniobj == NULL) {
6356 PyErr_SetString(PyExc_TypeError,
6357 "The fill character cannot be converted to Unicode");
6358 return 0;
6359 }
6360 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6361 PyErr_SetString(PyExc_TypeError,
6362 "The fill character must be exactly one character long");
6363 Py_DECREF(uniobj);
6364 return 0;
6365 }
6366 unistr = PyUnicode_AS_UNICODE(uniobj);
6367 *fillcharloc = unistr[0];
6368 Py_DECREF(uniobj);
6369 return 1;
6370}
6371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006372PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006373"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006375Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006376done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377
6378static PyObject *
6379unicode_center(PyUnicodeObject *self, PyObject *args)
6380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006381 Py_ssize_t marg, left;
6382 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006383 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
Thomas Woutersde017742006-02-16 19:34:37 +00006385 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 return NULL;
6387
Tim Peters7a29bd52001-09-12 03:03:31 +00006388 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 Py_INCREF(self);
6390 return (PyObject*) self;
6391 }
6392
6393 marg = width - self->length;
6394 left = marg / 2 + (marg & width & 1);
6395
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006396 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397}
6398
Marc-André Lemburge5034372000-08-08 08:04:29 +00006399#if 0
6400
6401/* This code should go into some future Unicode collation support
6402 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006403 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006404
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006405/* speedy UTF-16 code point order comparison */
6406/* gleaned from: */
6407/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6408
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006409static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006410{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006411 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006412 0, 0, 0, 0, 0, 0, 0, 0,
6413 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006414 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006415};
6416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417static int
6418unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006420 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 Py_UNICODE *s1 = str1->str;
6423 Py_UNICODE *s2 = str2->str;
6424
6425 len1 = str1->length;
6426 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006427
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006429 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006430
6431 c1 = *s1++;
6432 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006433
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006434 if (c1 > (1<<11) * 26)
6435 c1 += utf16Fixup[c1>>11];
6436 if (c2 > (1<<11) * 26)
6437 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006438 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006439
6440 if (c1 != c2)
6441 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006442
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006443 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
6445
6446 return (len1 < len2) ? -1 : (len1 != len2);
6447}
6448
Marc-André Lemburge5034372000-08-08 08:04:29 +00006449#else
6450
6451static int
6452unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006454 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006455
6456 Py_UNICODE *s1 = str1->str;
6457 Py_UNICODE *s2 = str2->str;
6458
6459 len1 = str1->length;
6460 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006461
Marc-André Lemburge5034372000-08-08 08:04:29 +00006462 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006463 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006464
Fredrik Lundh45714e92001-06-26 16:39:36 +00006465 c1 = *s1++;
6466 c2 = *s2++;
6467
6468 if (c1 != c2)
6469 return (c1 < c2) ? -1 : 1;
6470
Marc-André Lemburge5034372000-08-08 08:04:29 +00006471 len1--; len2--;
6472 }
6473
6474 return (len1 < len2) ? -1 : (len1 != len2);
6475}
6476
6477#endif
6478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479int PyUnicode_Compare(PyObject *left,
6480 PyObject *right)
6481{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006482 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6483 return unicode_compare((PyUnicodeObject *)left,
6484 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006485 PyErr_Format(PyExc_TypeError,
6486 "Can't compare %.100s and %.100s",
6487 left->ob_type->tp_name,
6488 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 return -1;
6490}
6491
Martin v. Löwis5b222132007-06-10 09:51:05 +00006492int
6493PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6494{
6495 int i;
6496 Py_UNICODE *id;
6497 assert(PyUnicode_Check(uni));
6498 id = PyUnicode_AS_UNICODE(uni);
6499 /* Compare Unicode string and source character set string */
6500 for (i = 0; id[i] && str[i]; i++)
6501 if (id[i] != str[i])
6502 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6503 if (id[i])
6504 return 1; /* uni is longer */
6505 if (str[i])
6506 return -1; /* str is longer */
6507 return 0;
6508}
6509
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006510
6511#define TEST_COND(cond) \
6512 ((cond) ? Py_True : Py_False)
6513
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006514PyObject *PyUnicode_RichCompare(PyObject *left,
6515 PyObject *right,
6516 int op)
6517{
6518 int result;
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006519
6520 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6521 PyObject *v;
6522 if (((PyUnicodeObject *) left)->length !=
6523 ((PyUnicodeObject *) right)->length) {
6524 if (op == Py_EQ) {
6525 Py_INCREF(Py_False);
6526 return Py_False;
6527 }
6528 if (op == Py_NE) {
6529 Py_INCREF(Py_True);
6530 return Py_True;
6531 }
6532 }
6533 if (left == right)
6534 result = 0;
6535 else
6536 result = unicode_compare((PyUnicodeObject *)left,
6537 (PyUnicodeObject *)right);
6538
6539 /* Convert the return value to a Boolean */
6540 switch (op) {
6541 case Py_EQ:
6542 v = TEST_COND(result == 0);
6543 break;
6544 case Py_NE:
6545 v = TEST_COND(result != 0);
6546 break;
6547 case Py_LE:
6548 v = TEST_COND(result <= 0);
6549 break;
6550 case Py_GE:
6551 v = TEST_COND(result >= 0);
6552 break;
6553 case Py_LT:
6554 v = TEST_COND(result == -1);
6555 break;
6556 case Py_GT:
6557 v = TEST_COND(result == 1);
6558 break;
6559 default:
6560 PyErr_BadArgument();
6561 return NULL;
6562 }
6563 Py_INCREF(v);
6564 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006565 }
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006566
6567 Py_INCREF(Py_NotImplemented);
6568 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006569}
6570
Guido van Rossum403d68b2000-03-13 15:55:09 +00006571int PyUnicode_Contains(PyObject *container,
6572 PyObject *element)
6573{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006574 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006576
6577 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 sub = PyUnicode_FromObject(element);
6579 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006580 PyErr_Format(PyExc_TypeError,
6581 "'in <string>' requires string as left operand, not %s",
6582 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006584 }
6585
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 str = PyUnicode_FromObject(container);
6587 if (!str) {
6588 Py_DECREF(sub);
6589 return -1;
6590 }
6591
6592 result = stringlib_contains_obj(str, sub);
6593
6594 Py_DECREF(str);
6595 Py_DECREF(sub);
6596
Guido van Rossum403d68b2000-03-13 15:55:09 +00006597 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006598}
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600/* Concat to string or Unicode object giving a new Unicode object. */
6601
6602PyObject *PyUnicode_Concat(PyObject *left,
6603 PyObject *right)
6604{
6605 PyUnicodeObject *u = NULL, *v = NULL, *w;
6606
6607 /* Coerce the two arguments */
6608 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6609 if (u == NULL)
6610 goto onError;
6611 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6612 if (v == NULL)
6613 goto onError;
6614
6615 /* Shortcuts */
6616 if (v == unicode_empty) {
6617 Py_DECREF(v);
6618 return (PyObject *)u;
6619 }
6620 if (u == unicode_empty) {
6621 Py_DECREF(u);
6622 return (PyObject *)v;
6623 }
6624
6625 /* Concat the two Unicode strings */
6626 w = _PyUnicode_New(u->length + v->length);
6627 if (w == NULL)
6628 goto onError;
6629 Py_UNICODE_COPY(w->str, u->str, u->length);
6630 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6631
6632 Py_DECREF(u);
6633 Py_DECREF(v);
6634 return (PyObject *)w;
6635
6636onError:
6637 Py_XDECREF(u);
6638 Py_XDECREF(v);
6639 return NULL;
6640}
6641
Walter Dörwald1ab83302007-05-18 17:15:44 +00006642void
6643PyUnicode_Append(PyObject **pleft, PyObject *right)
6644{
6645 PyObject *new;
6646 if (*pleft == NULL)
6647 return;
6648 if (right == NULL || !PyUnicode_Check(*pleft)) {
6649 Py_DECREF(*pleft);
6650 *pleft = NULL;
6651 return;
6652 }
6653 new = PyUnicode_Concat(*pleft, right);
6654 Py_DECREF(*pleft);
6655 *pleft = new;
6656}
6657
6658void
6659PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6660{
6661 PyUnicode_Append(pleft, right);
6662 Py_XDECREF(right);
6663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666"S.count(sub[, start[, end]]) -> int\n\
6667\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006669string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject *
6673unicode_count(PyUnicodeObject *self, PyObject *args)
6674{
6675 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006676 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006677 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 PyObject *result;
6679
Guido van Rossumb8872e62000-05-09 14:14:27 +00006680 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6681 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 return NULL;
6683
6684 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 if (substring == NULL)
6687 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006688
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Christian Heimes217cfd12007-12-02 14:31:20 +00006691 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692 stringlib_count(self->str + start, end - start,
6693 substring->str, substring->length)
6694 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695
6696 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 return result;
6699}
6700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006701PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006702"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006704Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006705to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6708'xmlcharrefreplace' as well as any other name registered with\n\
6709codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710
6711static PyObject *
6712unicode_encode(PyUnicodeObject *self, PyObject *args)
6713{
6714 char *encoding = NULL;
6715 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006716 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6719 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006720 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006721 if (v == NULL)
6722 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006723 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006724 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006725 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006726 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006727 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006728 Py_DECREF(v);
6729 return NULL;
6730 }
6731 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006732
6733 onError:
6734 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006735}
6736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006737PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006738"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739\n\
6740Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
6743static PyObject*
6744unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6745{
6746 Py_UNICODE *e;
6747 Py_UNICODE *p;
6748 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006749 Py_UNICODE *qe;
6750 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 PyUnicodeObject *u;
6752 int tabsize = 8;
6753
6754 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6755 return NULL;
6756
Thomas Wouters7e474022000-07-16 12:04:32 +00006757 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006758 i = 0; /* chars up to and including most recent \n or \r */
6759 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6760 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 for (p = self->str; p < e; p++)
6762 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006763 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006764 incr = tabsize - (j % tabsize); /* cannot overflow */
6765 if (j > PY_SSIZE_T_MAX - incr)
6766 goto overflow1;
6767 j += incr;
6768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 }
6770 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006771 if (j > PY_SSIZE_T_MAX - 1)
6772 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 j++;
6774 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006775 if (i > PY_SSIZE_T_MAX - j)
6776 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006778 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 }
6780 }
6781
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006782 if (i > PY_SSIZE_T_MAX - j)
6783 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 /* Second pass: create output string and fill it */
6786 u = _PyUnicode_New(i + j);
6787 if (!u)
6788 return NULL;
6789
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006790 j = 0; /* same as in first pass */
6791 q = u->str; /* next output char */
6792 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
6794 for (p = self->str; p < e; p++)
6795 if (*p == '\t') {
6796 if (tabsize > 0) {
6797 i = tabsize - (j % tabsize);
6798 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006799 while (i--) {
6800 if (q >= qe)
6801 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 }
6805 }
6806 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006807 if (q >= qe)
6808 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006810 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 if (*p == '\n' || *p == '\r')
6812 j = 0;
6813 }
6814
6815 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006816
6817 overflow2:
6818 Py_DECREF(u);
6819 overflow1:
6820 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006825"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826\n\
6827Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006828such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829arguments start and end are interpreted as in slice notation.\n\
6830\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
6833static PyObject *
6834unicode_find(PyUnicodeObject *self, PyObject *args)
6835{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006836 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006837 Py_ssize_t start;
6838 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006839 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
Christian Heimes9cd17752007-11-18 19:35:23 +00006841 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Thomas Wouters477c8d52006-05-27 19:21:47 +00006844 result = stringlib_find_slice(
6845 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6846 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6847 start, end
6848 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
6850 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006851
Christian Heimes217cfd12007-12-02 14:31:20 +00006852 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
6855static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006856unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
6858 if (index < 0 || index >= self->length) {
6859 PyErr_SetString(PyExc_IndexError, "string index out of range");
6860 return NULL;
6861 }
6862
6863 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6864}
6865
Guido van Rossumc2504932007-09-18 19:42:40 +00006866/* Believe it or not, this produces the same value for ASCII strings
6867 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006869unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870{
Guido van Rossumc2504932007-09-18 19:42:40 +00006871 Py_ssize_t len;
6872 Py_UNICODE *p;
6873 long x;
6874
6875 if (self->hash != -1)
6876 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006877 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006878 p = self->str;
6879 x = *p << 7;
6880 while (--len >= 0)
6881 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006882 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006883 if (x == -1)
6884 x = -2;
6885 self->hash = x;
6886 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887}
6888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006889PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006890"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
6894static PyObject *
6895unicode_index(PyUnicodeObject *self, PyObject *args)
6896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006897 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006898 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006899 Py_ssize_t start;
6900 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
Christian Heimes9cd17752007-11-18 19:35:23 +00006902 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
Thomas Wouters477c8d52006-05-27 19:21:47 +00006905 result = stringlib_find_slice(
6906 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6907 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6908 start, end
6909 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
6911 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 if (result < 0) {
6914 PyErr_SetString(PyExc_ValueError, "substring not found");
6915 return NULL;
6916 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006917
Christian Heimes217cfd12007-12-02 14:31:20 +00006918 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919}
6920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006928unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
6930 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6931 register const Py_UNICODE *e;
6932 int cased;
6933
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 /* Shortcut for single character strings */
6935 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006938 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006939 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 e = p + PyUnicode_GET_SIZE(self);
6943 cased = 0;
6944 for (; p < e; p++) {
6945 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 else if (!cased && Py_UNICODE_ISLOWER(ch))
6950 cased = 1;
6951 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953}
6954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006958Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
6961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006962unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
6964 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6965 register const Py_UNICODE *e;
6966 int cased;
6967
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 /* Shortcut for single character strings */
6969 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006972 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006973 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 e = p + PyUnicode_GET_SIZE(self);
6977 cased = 0;
6978 for (; p < e; p++) {
6979 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 else if (!cased && Py_UNICODE_ISUPPER(ch))
6984 cased = 1;
6985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006992Return True if S is a titlecased string and there is at least one\n\
6993character in S, i.e. upper- and titlecase characters may only\n\
6994follow uncased characters and lowercase characters only cased ones.\n\
6995Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
7000 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7001 register const Py_UNICODE *e;
7002 int cased, previous_is_cased;
7003
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 /* Shortcut for single character strings */
7005 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007006 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7007 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007009 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007010 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007011 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007012
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 e = p + PyUnicode_GET_SIZE(self);
7014 cased = 0;
7015 previous_is_cased = 0;
7016 for (; p < e; p++) {
7017 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007018
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7020 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007021 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 previous_is_cased = 1;
7023 cased = 1;
7024 }
7025 else if (Py_UNICODE_ISLOWER(ch)) {
7026 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007027 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 previous_is_cased = 1;
7029 cased = 1;
7030 }
7031 else
7032 previous_is_cased = 0;
7033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007034 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007037PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007040Return True if all characters in S are whitespace\n\
7041and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
7043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007044unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
7046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7047 register const Py_UNICODE *e;
7048
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 /* Shortcut for single character strings */
7050 if (PyUnicode_GET_SIZE(self) == 1 &&
7051 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007054 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007055 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007056 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 e = p + PyUnicode_GET_SIZE(self);
7059 for (; p < e; p++) {
7060 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007061 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007063 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007067"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007068\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007069Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007070and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007071
7072static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007073unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007074{
7075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7076 register const Py_UNICODE *e;
7077
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007078 /* Shortcut for single character strings */
7079 if (PyUnicode_GET_SIZE(self) == 1 &&
7080 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007081 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007082
7083 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007084 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007085 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007086
7087 e = p + PyUnicode_GET_SIZE(self);
7088 for (; p < e; p++) {
7089 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007090 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007092 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007093}
7094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007095PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007096"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007097\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007098Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007099and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007100
7101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007102unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007103{
7104 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7105 register const Py_UNICODE *e;
7106
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007107 /* Shortcut for single character strings */
7108 if (PyUnicode_GET_SIZE(self) == 1 &&
7109 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007110 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007111
7112 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007113 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007114 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007115
7116 e = p + PyUnicode_GET_SIZE(self);
7117 for (; p < e; p++) {
7118 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007119 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007121 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007122}
7123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007124PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007127Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007131unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
7133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7134 register const Py_UNICODE *e;
7135
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 /* Shortcut for single character strings */
7137 if (PyUnicode_GET_SIZE(self) == 1 &&
7138 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007141 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007142 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007144
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 e = p + PyUnicode_GET_SIZE(self);
7146 for (; p < e; p++) {
7147 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151}
7152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007153PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007154"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007156Return True if all characters in S are digits\n\
7157and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158
7159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007160unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161{
7162 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7163 register const Py_UNICODE *e;
7164
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 /* Shortcut for single character strings */
7166 if (PyUnicode_GET_SIZE(self) == 1 &&
7167 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007168 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007170 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007171 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007173
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 e = p + PyUnicode_GET_SIZE(self);
7175 for (; p < e; p++) {
7176 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180}
7181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007183"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007185Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007186False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187
7188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007189unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190{
7191 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7192 register const Py_UNICODE *e;
7193
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 /* Shortcut for single character strings */
7195 if (PyUnicode_GET_SIZE(self) == 1 &&
7196 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007197 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007199 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007200 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007201 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007202
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 e = p + PyUnicode_GET_SIZE(self);
7204 for (; p < e; p++) {
7205 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007206 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007208 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209}
7210
Martin v. Löwis47383402007-08-15 07:32:56 +00007211int
7212PyUnicode_IsIdentifier(PyObject *self)
7213{
7214 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7215 register const Py_UNICODE *e;
7216
7217 /* Special case for empty strings */
7218 if (PyUnicode_GET_SIZE(self) == 0)
7219 return 0;
7220
7221 /* PEP 3131 says that the first character must be in
7222 XID_Start and subsequent characters in XID_Continue,
7223 and for the ASCII range, the 2.x rules apply (i.e
7224 start with letters and underscore, continue with
7225 letters, digits, underscore). However, given the current
7226 definition of XID_Start and XID_Continue, it is sufficient
7227 to check just for these, except that _ must be allowed
7228 as starting an identifier. */
7229 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7230 return 0;
7231
7232 e = p + PyUnicode_GET_SIZE(self);
7233 for (p++; p < e; p++) {
7234 if (!_PyUnicode_IsXidContinue(*p))
7235 return 0;
7236 }
7237 return 1;
7238}
7239
7240PyDoc_STRVAR(isidentifier__doc__,
7241"S.isidentifier() -> bool\n\
7242\n\
7243Return True if S is a valid identifier according\n\
7244to the language definition.");
7245
7246static PyObject*
7247unicode_isidentifier(PyObject *self)
7248{
7249 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7250}
7251
Georg Brandl559e5d72008-06-11 18:37:52 +00007252PyDoc_STRVAR(isprintable__doc__,
7253"S.isprintable() -> bool\n\
7254\n\
7255Return True if all characters in S are considered\n\
7256printable in repr() or S is empty, False otherwise.");
7257
7258static PyObject*
7259unicode_isprintable(PyObject *self)
7260{
7261 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7262 register const Py_UNICODE *e;
7263
7264 /* Shortcut for single character strings */
7265 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7266 Py_RETURN_TRUE;
7267 }
7268
7269 e = p + PyUnicode_GET_SIZE(self);
7270 for (; p < e; p++) {
7271 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7272 Py_RETURN_FALSE;
7273 }
7274 }
7275 Py_RETURN_TRUE;
7276}
7277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007279"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280\n\
7281Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007282sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007285unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007287 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291unicode_length(PyUnicodeObject *self)
7292{
7293 return self->length;
7294}
7295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007297"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007299Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007300done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302static PyObject *
7303unicode_ljust(PyUnicodeObject *self, PyObject *args)
7304{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007305 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007306 Py_UNICODE fillchar = ' ';
7307
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007308 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 return NULL;
7310
Tim Peters7a29bd52001-09-12 03:03:31 +00007311 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 Py_INCREF(self);
7313 return (PyObject*) self;
7314 }
7315
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007316 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317}
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007320"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007325unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 return fixup(self, fixlower);
7328}
7329
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its
   3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7338
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007339/* externally visible for str.strip(unicode) */
7340PyObject *
7341_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7342{
7343 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007344 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007345 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007346 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7347 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007348
Thomas Wouters477c8d52006-05-27 19:21:47 +00007349 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7350
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007351 i = 0;
7352 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007353 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7354 i++;
7355 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007356 }
7357
7358 j = len;
7359 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007360 do {
7361 j--;
7362 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7363 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007364 }
7365
7366 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007367 Py_INCREF(self);
7368 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007369 }
7370 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007371 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007372}
7373
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
7375static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007376do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007378 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007380
7381 i = 0;
7382 if (striptype != RIGHTSTRIP) {
7383 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7384 i++;
7385 }
7386 }
7387
7388 j = len;
7389 if (striptype != LEFTSTRIP) {
7390 do {
7391 j--;
7392 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7393 j++;
7394 }
7395
7396 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7397 Py_INCREF(self);
7398 return (PyObject*)self;
7399 }
7400 else
7401 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007404
7405static PyObject *
7406do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7407{
7408 PyObject *sep = NULL;
7409
7410 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7411 return NULL;
7412
7413 if (sep != NULL && sep != Py_None) {
7414 if (PyUnicode_Check(sep))
7415 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007416 else {
7417 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007418 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007419 STRIPNAME(striptype));
7420 return NULL;
7421 }
7422 }
7423
7424 return do_strip(self, striptype);
7425}
7426
7427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007429"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007430\n\
7431Return a copy of the string S with leading and trailing\n\
7432whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007433If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007434
7435static PyObject *
7436unicode_strip(PyUnicodeObject *self, PyObject *args)
7437{
7438 if (PyTuple_GET_SIZE(args) == 0)
7439 return do_strip(self, BOTHSTRIP); /* Common case */
7440 else
7441 return do_argstrip(self, BOTHSTRIP, args);
7442}
7443
7444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007446"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007447\n\
7448Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007449If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007450
7451static PyObject *
7452unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7453{
7454 if (PyTuple_GET_SIZE(args) == 0)
7455 return do_strip(self, LEFTSTRIP); /* Common case */
7456 else
7457 return do_argstrip(self, LEFTSTRIP, args);
7458}
7459
7460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007462"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007463\n\
7464Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007465If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007466
7467static PyObject *
7468unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7469{
7470 if (PyTuple_GET_SIZE(args) == 0)
7471 return do_strip(self, RIGHTSTRIP); /* Common case */
7472 else
7473 return do_argstrip(self, RIGHTSTRIP, args);
7474}
7475
7476
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479{
7480 PyUnicodeObject *u;
7481 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007482 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007483 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
7485 if (len < 0)
7486 len = 0;
7487
Tim Peters7a29bd52001-09-12 03:03:31 +00007488 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 /* no repeat, return original string */
7490 Py_INCREF(str);
7491 return (PyObject*) str;
7492 }
Tim Peters8f422462000-09-09 06:13:41 +00007493
7494 /* ensure # of chars needed doesn't overflow int and # of bytes
7495 * needed doesn't overflow size_t
7496 */
7497 nchars = len * str->length;
7498 if (len && nchars / len != str->length) {
7499 PyErr_SetString(PyExc_OverflowError,
7500 "repeated string is too long");
7501 return NULL;
7502 }
7503 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7504 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7505 PyErr_SetString(PyExc_OverflowError,
7506 "repeated string is too long");
7507 return NULL;
7508 }
7509 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 if (!u)
7511 return NULL;
7512
7513 p = u->str;
7514
Thomas Wouters477c8d52006-05-27 19:21:47 +00007515 if (str->length == 1 && len > 0) {
7516 Py_UNICODE_FILL(p, str->str[0], len);
7517 } else {
7518 Py_ssize_t done = 0; /* number of characters copied this far */
7519 if (done < nchars) {
7520 Py_UNICODE_COPY(p, str->str, str->length);
7521 done = str->length;
7522 }
7523 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007524 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007525 Py_UNICODE_COPY(p+done, p, n);
7526 done += n;
7527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 }
7529
7530 return (PyObject*) u;
7531}
7532
7533PyObject *PyUnicode_Replace(PyObject *obj,
7534 PyObject *subobj,
7535 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537{
7538 PyObject *self;
7539 PyObject *str1;
7540 PyObject *str2;
7541 PyObject *result;
7542
7543 self = PyUnicode_FromObject(obj);
7544 if (self == NULL)
7545 return NULL;
7546 str1 = PyUnicode_FromObject(subobj);
7547 if (str1 == NULL) {
7548 Py_DECREF(self);
7549 return NULL;
7550 }
7551 str2 = PyUnicode_FromObject(replobj);
7552 if (str2 == NULL) {
7553 Py_DECREF(self);
7554 Py_DECREF(str1);
7555 return NULL;
7556 }
Tim Petersced69f82003-09-16 20:30:58 +00007557 result = replace((PyUnicodeObject *)self,
7558 (PyUnicodeObject *)str1,
7559 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 maxcount);
7561 Py_DECREF(self);
7562 Py_DECREF(str1);
7563 Py_DECREF(str2);
7564 return result;
7565}
7566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007568"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569\n\
7570Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007571old replaced by new. If the optional argument count is\n\
7572given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
7574static PyObject*
7575unicode_replace(PyUnicodeObject *self, PyObject *args)
7576{
7577 PyUnicodeObject *str1;
7578 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007579 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 PyObject *result;
7581
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 return NULL;
7584 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7585 if (str1 == NULL)
7586 return NULL;
7587 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007588 if (str2 == NULL) {
7589 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593 result = replace(self, str1, str2, maxcount);
7594
7595 Py_DECREF(str1);
7596 Py_DECREF(str2);
7597 return result;
7598}
7599
7600static
7601PyObject *unicode_repr(PyObject *unicode)
7602{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007603 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007604 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007605 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7606 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7607
7608 /* XXX(nnorwitz): rather than over-allocating, it would be
7609 better to choose a different scheme. Perhaps scan the
7610 first N-chars of the string and allocate based on that size.
7611 */
7612 /* Initial allocation is based on the longest-possible unichr
7613 escape.
7614
7615 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7616 unichr, so in this case it's the longest unichr escape. In
7617 narrow (UTF-16) builds this is five chars per source unichr
7618 since there are two unichrs in the surrogate pair, so in narrow
7619 (UTF-16) builds it's not the longest unichr escape.
7620
7621 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7622 so in the narrow (UTF-16) build case it's the longest unichr
7623 escape.
7624 */
7625
Walter Dörwald1ab83302007-05-18 17:15:44 +00007626 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007627 2 /* quotes */
7628#ifdef Py_UNICODE_WIDE
7629 + 10*size
7630#else
7631 + 6*size
7632#endif
7633 + 1);
7634 if (repr == NULL)
7635 return NULL;
7636
Walter Dörwald1ab83302007-05-18 17:15:44 +00007637 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007638
7639 /* Add quote */
7640 *p++ = (findchar(s, size, '\'') &&
7641 !findchar(s, size, '"')) ? '"' : '\'';
7642 while (size-- > 0) {
7643 Py_UNICODE ch = *s++;
7644
7645 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007646 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007647 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007648 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007649 continue;
7650 }
7651
Georg Brandl559e5d72008-06-11 18:37:52 +00007652 /* Map special whitespace to '\t', \n', '\r' */
7653 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007654 *p++ = '\\';
7655 *p++ = 't';
7656 }
7657 else if (ch == '\n') {
7658 *p++ = '\\';
7659 *p++ = 'n';
7660 }
7661 else if (ch == '\r') {
7662 *p++ = '\\';
7663 *p++ = 'r';
7664 }
7665
7666 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007667 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007668 *p++ = '\\';
7669 *p++ = 'x';
7670 *p++ = hexdigits[(ch >> 4) & 0x000F];
7671 *p++ = hexdigits[ch & 0x000F];
7672 }
7673
Georg Brandl559e5d72008-06-11 18:37:52 +00007674 /* Copy ASCII characters as-is */
7675 else if (ch < 0x7F) {
7676 *p++ = ch;
7677 }
7678
7679 /* Non-ASCII characters */
7680 else {
7681 Py_UCS4 ucs = ch;
7682
7683#ifndef Py_UNICODE_WIDE
7684 Py_UNICODE ch2 = 0;
7685 /* Get code point from surrogate pair */
7686 if (size > 0) {
7687 ch2 = *s;
7688 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7689 && ch2 <= 0xDFFF) {
7690 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7691 + 0x00010000;
7692 s++;
7693 size--;
7694 }
7695 }
7696#endif
7697 /* Map Unicode whitespace and control characters
7698 (categories Z* and C* except ASCII space)
7699 */
7700 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7701 /* Map 8-bit characters to '\xhh' */
7702 if (ucs <= 0xff) {
7703 *p++ = '\\';
7704 *p++ = 'x';
7705 *p++ = hexdigits[(ch >> 4) & 0x000F];
7706 *p++ = hexdigits[ch & 0x000F];
7707 }
7708 /* Map 21-bit characters to '\U00xxxxxx' */
7709 else if (ucs >= 0x10000) {
7710 *p++ = '\\';
7711 *p++ = 'U';
7712 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7713 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7714 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7715 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7716 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7717 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7718 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7719 *p++ = hexdigits[ucs & 0x0000000F];
7720 }
7721 /* Map 16-bit characters to '\uxxxx' */
7722 else {
7723 *p++ = '\\';
7724 *p++ = 'u';
7725 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7726 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7727 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7728 *p++ = hexdigits[ucs & 0x000F];
7729 }
7730 }
7731 /* Copy characters as-is */
7732 else {
7733 *p++ = ch;
7734#ifndef Py_UNICODE_WIDE
7735 if (ucs >= 0x10000)
7736 *p++ = ch2;
7737#endif
7738 }
7739 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007740 }
7741 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007742 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007743
7744 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007745 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007746 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747}
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007750"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751\n\
7752Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007753such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754arguments start and end are interpreted as in slice notation.\n\
7755\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
7758static PyObject *
7759unicode_rfind(PyUnicodeObject *self, PyObject *args)
7760{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007761 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007762 Py_ssize_t start;
7763 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007764 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
Christian Heimes9cd17752007-11-18 19:35:23 +00007766 if (!_ParseTupleFinds(args, &substring, &start, &end))
7767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768
Thomas Wouters477c8d52006-05-27 19:21:47 +00007769 result = stringlib_rfind_slice(
7770 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7771 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7772 start, end
7773 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774
7775 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007776
Christian Heimes217cfd12007-12-02 14:31:20 +00007777 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007781"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007783Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
7785static PyObject *
7786unicode_rindex(PyUnicodeObject *self, PyObject *args)
7787{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007788 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007789 Py_ssize_t start;
7790 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007791 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
Christian Heimes9cd17752007-11-18 19:35:23 +00007793 if (!_ParseTupleFinds(args, &substring, &start, &end))
7794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
Thomas Wouters477c8d52006-05-27 19:21:47 +00007796 result = stringlib_rfind_slice(
7797 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7798 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7799 start, end
7800 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
7802 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007803
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 if (result < 0) {
7805 PyErr_SetString(PyExc_ValueError, "substring not found");
7806 return NULL;
7807 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007808 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809}
7810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007811PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007812"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007814Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007815done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816
7817static PyObject *
7818unicode_rjust(PyUnicodeObject *self, PyObject *args)
7819{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007820 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007821 Py_UNICODE fillchar = ' ';
7822
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007823 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 return NULL;
7825
Tim Peters7a29bd52001-09-12 03:03:31 +00007826 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 Py_INCREF(self);
7828 return (PyObject*) self;
7829 }
7830
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007831 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832}
7833
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834PyObject *PyUnicode_Split(PyObject *s,
7835 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837{
7838 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007839
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 s = PyUnicode_FromObject(s);
7841 if (s == NULL)
7842 return NULL;
7843 if (sep != NULL) {
7844 sep = PyUnicode_FromObject(sep);
7845 if (sep == NULL) {
7846 Py_DECREF(s);
7847 return NULL;
7848 }
7849 }
7850
7851 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7852
7853 Py_DECREF(s);
7854 Py_XDECREF(sep);
7855 return result;
7856}
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007859"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860\n\
7861Return a list of the words in S, using sep as the\n\
7862delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007863splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007864whitespace string is a separator and empty strings are\n\
7865removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
7867static PyObject*
7868unicode_split(PyUnicodeObject *self, PyObject *args)
7869{
7870 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 return NULL;
7875
7876 if (substring == Py_None)
7877 return split(self, NULL, maxcount);
7878 else if (PyUnicode_Check(substring))
7879 return split(self, (PyUnicodeObject *)substring, maxcount);
7880 else
7881 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7882}
7883
Thomas Wouters477c8d52006-05-27 19:21:47 +00007884PyObject *
7885PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7886{
7887 PyObject* str_obj;
7888 PyObject* sep_obj;
7889 PyObject* out;
7890
7891 str_obj = PyUnicode_FromObject(str_in);
7892 if (!str_obj)
7893 return NULL;
7894 sep_obj = PyUnicode_FromObject(sep_in);
7895 if (!sep_obj) {
7896 Py_DECREF(str_obj);
7897 return NULL;
7898 }
7899
7900 out = stringlib_partition(
7901 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7902 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7903 );
7904
7905 Py_DECREF(sep_obj);
7906 Py_DECREF(str_obj);
7907
7908 return out;
7909}
7910
7911
7912PyObject *
7913PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7914{
7915 PyObject* str_obj;
7916 PyObject* sep_obj;
7917 PyObject* out;
7918
7919 str_obj = PyUnicode_FromObject(str_in);
7920 if (!str_obj)
7921 return NULL;
7922 sep_obj = PyUnicode_FromObject(sep_in);
7923 if (!sep_obj) {
7924 Py_DECREF(str_obj);
7925 return NULL;
7926 }
7927
7928 out = stringlib_rpartition(
7929 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7930 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7931 );
7932
7933 Py_DECREF(sep_obj);
7934 Py_DECREF(str_obj);
7935
7936 return out;
7937}
7938
7939PyDoc_STRVAR(partition__doc__,
7940"S.partition(sep) -> (head, sep, tail)\n\
7941\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007942Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007943the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007944found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007945
7946static PyObject*
7947unicode_partition(PyUnicodeObject *self, PyObject *separator)
7948{
7949 return PyUnicode_Partition((PyObject *)self, separator);
7950}
7951
7952PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007953"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007955Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007956the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007957separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007958
7959static PyObject*
7960unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7961{
7962 return PyUnicode_RPartition((PyObject *)self, separator);
7963}
7964
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007965PyObject *PyUnicode_RSplit(PyObject *s,
7966 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007967 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007968{
7969 PyObject *result;
7970
7971 s = PyUnicode_FromObject(s);
7972 if (s == NULL)
7973 return NULL;
7974 if (sep != NULL) {
7975 sep = PyUnicode_FromObject(sep);
7976 if (sep == NULL) {
7977 Py_DECREF(s);
7978 return NULL;
7979 }
7980 }
7981
7982 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7983
7984 Py_DECREF(s);
7985 Py_XDECREF(sep);
7986 return result;
7987}
7988
7989PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007990"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007991\n\
7992Return a list of the words in S, using sep as the\n\
7993delimiter string, starting at the end of the string and\n\
7994working to the front. If maxsplit is given, at most maxsplit\n\
7995splits are done. If sep is not specified, any whitespace string\n\
7996is a separator.");
7997
7998static PyObject*
7999unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8000{
8001 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008002 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008003
Martin v. Löwis18e16552006-02-15 17:27:45 +00008004 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008005 return NULL;
8006
8007 if (substring == Py_None)
8008 return rsplit(self, NULL, maxcount);
8009 else if (PyUnicode_Check(substring))
8010 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8011 else
8012 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8013}
8014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008015PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson4469d0c2008-11-30 22:46:23 +00008016"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017\n\
8018Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008019Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008020is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
8022static PyObject*
8023unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8024{
Guido van Rossum86662912000-04-11 15:38:46 +00008025 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
Guido van Rossum86662912000-04-11 15:38:46 +00008027 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 return NULL;
8029
Guido van Rossum86662912000-04-11 15:38:46 +00008030 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031}
8032
8033static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008034PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
Walter Dörwald346737f2007-05-31 10:44:43 +00008036 if (PyUnicode_CheckExact(self)) {
8037 Py_INCREF(self);
8038 return self;
8039 } else
8040 /* Subtype -- return genuine unicode string with the same value. */
8041 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8042 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043}
8044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008045PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008046"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047\n\
8048Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008049and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
8051static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008052unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return fixup(self, fixswapcase);
8055}
8056
Georg Brandlceee0772007-11-27 23:48:05 +00008057PyDoc_STRVAR(maketrans__doc__,
8058"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8059\n\
8060Return a translation table usable for str.translate().\n\
8061If there is only one argument, it must be a dictionary mapping Unicode\n\
8062ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008063Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008064If there are two arguments, they must be strings of equal length, and\n\
8065in the resulting dictionary, each character in x will be mapped to the\n\
8066character at the same position in y. If there is a third argument, it\n\
8067must be a string, whose characters will be mapped to None in the result.");
8068
8069static PyObject*
8070unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8071{
8072 PyObject *x, *y = NULL, *z = NULL;
8073 PyObject *new = NULL, *key, *value;
8074 Py_ssize_t i = 0;
8075 int res;
8076
8077 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8078 return NULL;
8079 new = PyDict_New();
8080 if (!new)
8081 return NULL;
8082 if (y != NULL) {
8083 /* x must be a string too, of equal length */
8084 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8085 if (!PyUnicode_Check(x)) {
8086 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8087 "be a string if there is a second argument");
8088 goto err;
8089 }
8090 if (PyUnicode_GET_SIZE(x) != ylen) {
8091 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8092 "arguments must have equal length");
8093 goto err;
8094 }
8095 /* create entries for translating chars in x to those in y */
8096 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008097 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8098 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008099 if (!key || !value)
8100 goto err;
8101 res = PyDict_SetItem(new, key, value);
8102 Py_DECREF(key);
8103 Py_DECREF(value);
8104 if (res < 0)
8105 goto err;
8106 }
8107 /* create entries for deleting chars in z */
8108 if (z != NULL) {
8109 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008110 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008111 if (!key)
8112 goto err;
8113 res = PyDict_SetItem(new, key, Py_None);
8114 Py_DECREF(key);
8115 if (res < 0)
8116 goto err;
8117 }
8118 }
8119 } else {
8120 /* x must be a dict */
8121 if (!PyDict_Check(x)) {
8122 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8123 "to maketrans it must be a dict");
8124 goto err;
8125 }
8126 /* copy entries into the new dict, converting string keys to int keys */
8127 while (PyDict_Next(x, &i, &key, &value)) {
8128 if (PyUnicode_Check(key)) {
8129 /* convert string keys to integer keys */
8130 PyObject *newkey;
8131 if (PyUnicode_GET_SIZE(key) != 1) {
8132 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8133 "table must be of length 1");
8134 goto err;
8135 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008136 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008137 if (!newkey)
8138 goto err;
8139 res = PyDict_SetItem(new, newkey, value);
8140 Py_DECREF(newkey);
8141 if (res < 0)
8142 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008143 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008144 /* just keep integer keys */
8145 if (PyDict_SetItem(new, key, value) < 0)
8146 goto err;
8147 } else {
8148 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8149 "be strings or integers");
8150 goto err;
8151 }
8152 }
8153 }
8154 return new;
8155 err:
8156 Py_DECREF(new);
8157 return NULL;
8158}
8159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008160PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008161"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162\n\
8163Return a copy of the string S, where all characters have been mapped\n\
8164through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008165Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008166Unmapped characters are left untouched. Characters mapped to None\n\
8167are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
8169static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008170unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
Georg Brandlceee0772007-11-27 23:48:05 +00008172 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173}
8174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008175PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008176"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008178Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179
8180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008181unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 return fixup(self, fixupper);
8184}
8185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008186PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008187"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008189Pad a numeric string S with zeros on the left, to fill a field\n\
8190of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
8192static PyObject *
8193unicode_zfill(PyUnicodeObject *self, PyObject *args)
8194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008195 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 PyUnicodeObject *u;
8197
Martin v. Löwis18e16552006-02-15 17:27:45 +00008198 Py_ssize_t width;
8199 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 return NULL;
8201
8202 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008203 if (PyUnicode_CheckExact(self)) {
8204 Py_INCREF(self);
8205 return (PyObject*) self;
8206 }
8207 else
8208 return PyUnicode_FromUnicode(
8209 PyUnicode_AS_UNICODE(self),
8210 PyUnicode_GET_SIZE(self)
8211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 }
8213
8214 fill = width - self->length;
8215
8216 u = pad(self, fill, 0, '0');
8217
Walter Dörwald068325e2002-04-15 13:36:47 +00008218 if (u == NULL)
8219 return NULL;
8220
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 if (u->str[fill] == '+' || u->str[fill] == '-') {
8222 /* move sign to beginning of string */
8223 u->str[0] = u->str[fill];
8224 u->str[fill] = '0';
8225 }
8226
8227 return (PyObject*) u;
8228}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229
#if 0
/* Compiled out.  Reports the value of the file-level numfree counter
   as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008238PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008239"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008241Return True if S starts with the specified prefix, False otherwise.\n\
8242With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008243With optional end, stop comparing S at that position.\n\
8244prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
8246static PyObject *
8247unicode_startswith(PyUnicodeObject *self,
8248 PyObject *args)
8249{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008250 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008252 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008253 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008254 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008256 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008257 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008259 if (PyTuple_Check(subobj)) {
8260 Py_ssize_t i;
8261 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8262 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8263 PyTuple_GET_ITEM(subobj, i));
8264 if (substring == NULL)
8265 return NULL;
8266 result = tailmatch(self, substring, start, end, -1);
8267 Py_DECREF(substring);
8268 if (result) {
8269 Py_RETURN_TRUE;
8270 }
8271 }
8272 /* nothing matched */
8273 Py_RETURN_FALSE;
8274 }
8275 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008277 return NULL;
8278 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008280 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281}
8282
8283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008284PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008285"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008287Return True if S ends with the specified suffix, False otherwise.\n\
8288With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008289With optional end, stop comparing S at that position.\n\
8290suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291
8292static PyObject *
8293unicode_endswith(PyUnicodeObject *self,
8294 PyObject *args)
8295{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008296 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008298 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008299 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008300 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008302 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8303 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008305 if (PyTuple_Check(subobj)) {
8306 Py_ssize_t i;
8307 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8308 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8309 PyTuple_GET_ITEM(subobj, i));
8310 if (substring == NULL)
8311 return NULL;
8312 result = tailmatch(self, substring, start, end, +1);
8313 Py_DECREF(substring);
8314 if (result) {
8315 Py_RETURN_TRUE;
8316 }
8317 }
8318 Py_RETURN_FALSE;
8319 }
8320 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008324 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008326 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327}
8328
Eric Smith8c663262007-08-25 02:26:07 +00008329#include "stringlib/string_format.h"
8330
8331PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008332"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008333\n\
8334");
8335
Eric Smith4a7d76d2008-05-30 18:10:19 +00008336static PyObject *
8337unicode__format__(PyObject* self, PyObject* args)
8338{
8339 PyObject *format_spec;
8340
8341 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8342 return NULL;
8343
8344 return _PyUnicode_FormatAdvanced(self,
8345 PyUnicode_AS_UNICODE(format_spec),
8346 PyUnicode_GET_SIZE(format_spec));
8347}
8348
Eric Smith8c663262007-08-25 02:26:07 +00008349PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008350"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008351\n\
8352");
8353
8354static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008355unicode__sizeof__(PyUnicodeObject *v)
8356{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008357 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8358 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008359}
8360
8361PyDoc_STRVAR(sizeof__doc__,
8362"S.__sizeof__() -> size of S in memory, in bytes");
8363
8364static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008365unicode_getnewargs(PyUnicodeObject *v)
8366{
8367 return Py_BuildValue("(u#)", v->str, v->length);
8368}
8369
8370
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371static PyMethodDef unicode_methods[] = {
8372
8373 /* Order is according to common usage: often used methods should
8374 appear first, since lookup is done sequentially. */
8375
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008376 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8377 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8378 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008379 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008380 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8381 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8382 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8383 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8384 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8385 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8386 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008388 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8389 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8390 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008391 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008392 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8393 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8394 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008395 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008397 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008398 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008399 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8400 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8401 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8402 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8403 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8404 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8405 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8406 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8407 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8408 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8409 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8410 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8411 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8412 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008413 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008414 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008415 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008416 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008417 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008418 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8419 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008420 {"maketrans", (PyCFunction) unicode_maketrans,
8421 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008422 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008423#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008424 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425#endif
8426
8427#if 0
8428 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008429 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430#endif
8431
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008432 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 {NULL, NULL}
8434};
8435
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008436static PyObject *
8437unicode_mod(PyObject *v, PyObject *w)
8438{
8439 if (!PyUnicode_Check(v)) {
8440 Py_INCREF(Py_NotImplemented);
8441 return Py_NotImplemented;
8442 }
8443 return PyUnicode_Format(v, w);
8444}
8445
8446static PyNumberMethods unicode_as_number = {
8447 0, /*nb_add*/
8448 0, /*nb_subtract*/
8449 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008450 unicode_mod, /*nb_remainder*/
8451};
8452
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008454 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008455 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008456 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8457 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008458 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 0, /* sq_ass_item */
8460 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008461 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462};
8463
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008464static PyObject*
8465unicode_subscript(PyUnicodeObject* self, PyObject* item)
8466{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008467 if (PyIndex_Check(item)) {
8468 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008469 if (i == -1 && PyErr_Occurred())
8470 return NULL;
8471 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008472 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008473 return unicode_getitem(self, i);
8474 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008475 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008476 Py_UNICODE* source_buf;
8477 Py_UNICODE* result_buf;
8478 PyObject* result;
8479
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008480 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008481 &start, &stop, &step, &slicelength) < 0) {
8482 return NULL;
8483 }
8484
8485 if (slicelength <= 0) {
8486 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008487 } else if (start == 0 && step == 1 && slicelength == self->length &&
8488 PyUnicode_CheckExact(self)) {
8489 Py_INCREF(self);
8490 return (PyObject *)self;
8491 } else if (step == 1) {
8492 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008493 } else {
8494 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008495 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8496 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008497
8498 if (result_buf == NULL)
8499 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008500
8501 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8502 result_buf[i] = source_buf[cur];
8503 }
Tim Petersced69f82003-09-16 20:30:58 +00008504
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008505 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008506 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008507 return result;
8508 }
8509 } else {
8510 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8511 return NULL;
8512 }
8513}
8514
8515static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008516 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008517 (binaryfunc)unicode_subscript, /* mp_subscript */
8518 (objobjargproc)0, /* mp_ass_subscript */
8519};
8520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522/* Helpers for PyUnicode_Format() */
8523
8524static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008525getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (argidx < arglen) {
8529 (*p_argidx)++;
8530 if (arglen < 0)
8531 return args;
8532 else
8533 return PyTuple_GetItem(args, argidx);
8534 }
8535 PyErr_SetString(PyExc_TypeError,
8536 "not enough arguments for format string");
8537 return NULL;
8538}
8539
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008541strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 register Py_ssize_t i;
8544 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 for (i = len - 1; i >= 0; i--)
8546 buffer[i] = (Py_UNICODE) charbuffer[i];
8547
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 return len;
8549}
8550
Neal Norwitzfc76d632006-01-10 06:03:13 +00008551static int
8552doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8553{
Tim Peters15231542006-02-16 01:08:01 +00008554 Py_ssize_t result;
8555
Neal Norwitzfc76d632006-01-10 06:03:13 +00008556 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008557 result = strtounicode(buffer, (char *)buffer);
8558 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008559}
8560
Christian Heimes3fd13992008-03-21 01:05:49 +00008561#if 0
Neal Norwitzfc76d632006-01-10 06:03:13 +00008562static int
8563longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8564{
Tim Peters15231542006-02-16 01:08:01 +00008565 Py_ssize_t result;
8566
Neal Norwitzfc76d632006-01-10 06:03:13 +00008567 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008568 result = strtounicode(buffer, (char *)buffer);
8569 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008570}
Christian Heimes3fd13992008-03-21 01:05:49 +00008571#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008572
Guido van Rossum078151d2002-08-11 04:24:12 +00008573/* XXX To save some code duplication, formatfloat/long/int could have been
8574 shared with stringobject.c, converting from 8-bit to Unicode after the
8575 formatting is done. */
8576
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577static int
8578formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008579 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 int flags,
8581 int prec,
8582 int type,
8583 PyObject *v)
8584{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008585 /* fmt = '%#.' + `prec` + `type`
8586 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 char fmt[20];
8588 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008589
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 x = PyFloat_AsDouble(v);
8591 if (x == -1.0 && PyErr_Occurred())
8592 return -1;
8593 if (prec < 0)
8594 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008595 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8596 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008597 /* Worst case length calc to ensure no buffer overrun:
8598
8599 'g' formats:
8600 fmt = %#.<prec>g
8601 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8602 for any double rep.)
8603 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8604
8605 'f' formats:
8606 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8607 len = 1 + 50 + 1 + prec = 52 + prec
8608
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008609 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008610 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008611
8612 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008613 if (((type == 'g' || type == 'G') &&
8614 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008615 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008616 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008617 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008618 return -1;
8619 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008620 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8621 (flags&F_ALT) ? "#" : "",
8622 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008623 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
8625
Tim Peters38fd5b62000-09-21 05:43:11 +00008626static PyObject*
8627formatlong(PyObject *val, int flags, int prec, int type)
8628{
8629 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008630 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008631 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008632 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008633
Christian Heimes72b710a2008-05-26 13:28:38 +00008634 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008635 if (!str)
8636 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008637 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008638 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008639 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008640}
8641
Christian Heimes3fd13992008-03-21 01:05:49 +00008642#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643static int
8644formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008645 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 int flags,
8647 int prec,
8648 int type,
8649 PyObject *v)
8650{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008651 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008652 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8653 * + 1 + 1
8654 * = 24
8655 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008656 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008657 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 long x;
8659
Christian Heimes217cfd12007-12-02 14:31:20 +00008660 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008662 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008663 if (x < 0 && type == 'u') {
8664 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008665 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008666 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8667 sign = "-";
8668 else
8669 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008671 prec = 1;
8672
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008673 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8674 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008675 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008676 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008677 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008678 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008679 return -1;
8680 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008681
8682 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008683 (type == 'x' || type == 'X' || type == 'o')) {
8684 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008685 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008686 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008687 * - when 0 is being converted, the C standard leaves off
8688 * the '0x' or '0X', which is inconsistent with other
8689 * %#x/%#X conversions and inconsistent with Python's
8690 * hex() function
8691 * - there are platforms that violate the standard and
8692 * convert 0 with the '0x' or '0X'
8693 * (Metrowerks, Compaq Tru64)
8694 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008695 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008696 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008697 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008698 * We can achieve the desired consistency by inserting our
8699 * own '0x' or '0X' prefix, and substituting %x/%X in place
8700 * of %#x/%#X.
8701 *
8702 * Note that this is the same approach as used in
8703 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008704 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008705 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8706 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008707 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008708 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008709 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8710 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008711 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008712 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008713 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008714 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008715 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008716 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717}
Christian Heimes3fd13992008-03-21 01:05:49 +00008718#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
8720static int
8721formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008722 size_t buflen,
8723 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008725 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008726 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008727 if (PyUnicode_GET_SIZE(v) == 1) {
8728 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8729 buf[1] = '\0';
8730 return 1;
8731 }
8732#ifndef Py_UNICODE_WIDE
8733 if (PyUnicode_GET_SIZE(v) == 2) {
8734 /* Decode a valid surrogate pair */
8735 int c0 = PyUnicode_AS_UNICODE(v)[0];
8736 int c1 = PyUnicode_AS_UNICODE(v)[1];
8737 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8738 0xDC00 <= c1 && c1 <= 0xDFFF) {
8739 buf[0] = c0;
8740 buf[1] = c1;
8741 buf[2] = '\0';
8742 return 2;
8743 }
8744 }
8745#endif
8746 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 else {
8749 /* Integer input truncated to a character */
8750 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008751 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008753 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008754
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008755 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008756 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008757 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008758 return -1;
8759 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008760
8761#ifndef Py_UNICODE_WIDE
8762 if (x > 0xffff) {
8763 x -= 0x10000;
8764 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8765 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8766 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008767 }
8768#endif
8769 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008770 buf[1] = '\0';
8771 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008773
8774 onError:
8775 PyErr_SetString(PyExc_TypeError,
8776 "%c requires int or char");
8777 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778}
8779
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008780/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8781
8782 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8783 chars are formatted. XXX This is a magic number. Each formatting
8784 routine does bounds checking to ensure no overflow, but a better
8785 solution may be to malloc a buffer of appropriate size for each
8786 format. For now, the current solution is sufficient.
8787*/
8788#define FORMATBUFLEN (size_t)120
8789
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790PyObject *PyUnicode_Format(PyObject *format,
8791 PyObject *args)
8792{
8793 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008794 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 int args_owned = 0;
8796 PyUnicodeObject *result = NULL;
8797 PyObject *dict = NULL;
8798 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 if (format == NULL || args == NULL) {
8801 PyErr_BadInternalCall();
8802 return NULL;
8803 }
8804 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008805 if (uformat == NULL)
8806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 fmt = PyUnicode_AS_UNICODE(uformat);
8808 fmtcnt = PyUnicode_GET_SIZE(uformat);
8809
8810 reslen = rescnt = fmtcnt + 100;
8811 result = _PyUnicode_New(reslen);
8812 if (result == NULL)
8813 goto onError;
8814 res = PyUnicode_AS_UNICODE(result);
8815
8816 if (PyTuple_Check(args)) {
8817 arglen = PyTuple_Size(args);
8818 argidx = 0;
8819 }
8820 else {
8821 arglen = -1;
8822 argidx = -2;
8823 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008824 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008825 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 dict = args;
8827
8828 while (--fmtcnt >= 0) {
8829 if (*fmt != '%') {
8830 if (--rescnt < 0) {
8831 rescnt = fmtcnt + 100;
8832 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008833 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8836 --rescnt;
8837 }
8838 *res++ = *fmt++;
8839 }
8840 else {
8841 /* Got a format specifier */
8842 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008843 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845 Py_UNICODE c = '\0';
8846 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008847 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 PyObject *v = NULL;
8849 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008850 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008852 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008853 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854
8855 fmt++;
8856 if (*fmt == '(') {
8857 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008858 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 PyObject *key;
8860 int pcount = 1;
8861
8862 if (dict == NULL) {
8863 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008864 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 goto onError;
8866 }
8867 ++fmt;
8868 --fmtcnt;
8869 keystart = fmt;
8870 /* Skip over balanced parentheses */
8871 while (pcount > 0 && --fmtcnt >= 0) {
8872 if (*fmt == ')')
8873 --pcount;
8874 else if (*fmt == '(')
8875 ++pcount;
8876 fmt++;
8877 }
8878 keylen = fmt - keystart - 1;
8879 if (fmtcnt < 0 || pcount > 0) {
8880 PyErr_SetString(PyExc_ValueError,
8881 "incomplete format key");
8882 goto onError;
8883 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008884#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008885 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 then looked up since Python uses strings to hold
8887 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008888 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 key = PyUnicode_EncodeUTF8(keystart,
8890 keylen,
8891 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008892#else
8893 key = PyUnicode_FromUnicode(keystart, keylen);
8894#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 if (key == NULL)
8896 goto onError;
8897 if (args_owned) {
8898 Py_DECREF(args);
8899 args_owned = 0;
8900 }
8901 args = PyObject_GetItem(dict, key);
8902 Py_DECREF(key);
8903 if (args == NULL) {
8904 goto onError;
8905 }
8906 args_owned = 1;
8907 arglen = -1;
8908 argidx = -2;
8909 }
8910 while (--fmtcnt >= 0) {
8911 switch (c = *fmt++) {
8912 case '-': flags |= F_LJUST; continue;
8913 case '+': flags |= F_SIGN; continue;
8914 case ' ': flags |= F_BLANK; continue;
8915 case '#': flags |= F_ALT; continue;
8916 case '0': flags |= F_ZERO; continue;
8917 }
8918 break;
8919 }
8920 if (c == '*') {
8921 v = getnextarg(args, arglen, &argidx);
8922 if (v == NULL)
8923 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008924 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 PyErr_SetString(PyExc_TypeError,
8926 "* wants int");
8927 goto onError;
8928 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008929 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008930 if (width == -1 && PyErr_Occurred())
8931 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 if (width < 0) {
8933 flags |= F_LJUST;
8934 width = -width;
8935 }
8936 if (--fmtcnt >= 0)
8937 c = *fmt++;
8938 }
8939 else if (c >= '0' && c <= '9') {
8940 width = c - '0';
8941 while (--fmtcnt >= 0) {
8942 c = *fmt++;
8943 if (c < '0' || c > '9')
8944 break;
8945 if ((width*10) / 10 != width) {
8946 PyErr_SetString(PyExc_ValueError,
8947 "width too big");
8948 goto onError;
8949 }
8950 width = width*10 + (c - '0');
8951 }
8952 }
8953 if (c == '.') {
8954 prec = 0;
8955 if (--fmtcnt >= 0)
8956 c = *fmt++;
8957 if (c == '*') {
8958 v = getnextarg(args, arglen, &argidx);
8959 if (v == NULL)
8960 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008961 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 PyErr_SetString(PyExc_TypeError,
8963 "* wants int");
8964 goto onError;
8965 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008966 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008967 if (prec == -1 && PyErr_Occurred())
8968 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 if (prec < 0)
8970 prec = 0;
8971 if (--fmtcnt >= 0)
8972 c = *fmt++;
8973 }
8974 else if (c >= '0' && c <= '9') {
8975 prec = c - '0';
8976 while (--fmtcnt >= 0) {
8977 c = Py_CHARMASK(*fmt++);
8978 if (c < '0' || c > '9')
8979 break;
8980 if ((prec*10) / 10 != prec) {
8981 PyErr_SetString(PyExc_ValueError,
8982 "prec too big");
8983 goto onError;
8984 }
8985 prec = prec*10 + (c - '0');
8986 }
8987 }
8988 } /* prec */
8989 if (fmtcnt >= 0) {
8990 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 if (--fmtcnt >= 0)
8992 c = *fmt++;
8993 }
8994 }
8995 if (fmtcnt < 0) {
8996 PyErr_SetString(PyExc_ValueError,
8997 "incomplete format");
8998 goto onError;
8999 }
9000 if (c != '%') {
9001 v = getnextarg(args, arglen, &argidx);
9002 if (v == NULL)
9003 goto onError;
9004 }
9005 sign = 0;
9006 fill = ' ';
9007 switch (c) {
9008
9009 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009010 pbuf = formatbuf;
9011 /* presume that buffer length is at least 1 */
9012 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 len = 1;
9014 break;
9015
9016 case 's':
9017 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009018 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 if (PyUnicode_Check(v) && c == 's') {
9020 temp = v;
9021 Py_INCREF(temp);
9022 }
9023 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009025 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009026 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009028 else
9029 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 if (temp == NULL)
9031 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009032 if (PyUnicode_Check(temp))
9033 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009034 else {
9035 Py_DECREF(temp);
9036 PyErr_SetString(PyExc_TypeError,
9037 "%s argument has non-string str()");
9038 goto onError;
9039 }
9040 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009041 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 len = PyUnicode_GET_SIZE(temp);
9043 if (prec >= 0 && len > prec)
9044 len = prec;
9045 break;
9046
9047 case 'i':
9048 case 'd':
9049 case 'u':
9050 case 'o':
9051 case 'x':
9052 case 'X':
9053 if (c == 'i')
9054 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009055 isnumok = 0;
9056 if (PyNumber_Check(v)) {
9057 PyObject *iobj=NULL;
9058
9059 if (PyLong_Check(v)) {
9060 iobj = v;
9061 Py_INCREF(iobj);
9062 }
9063 else {
9064 iobj = PyNumber_Long(v);
9065 }
9066 if (iobj!=NULL) {
9067 if (PyLong_Check(iobj)) {
9068 isnumok = 1;
9069 temp = formatlong(iobj, flags, prec, c);
9070 Py_DECREF(iobj);
9071 if (!temp)
9072 goto onError;
9073 pbuf = PyUnicode_AS_UNICODE(temp);
9074 len = PyUnicode_GET_SIZE(temp);
9075 sign = 1;
9076 }
9077 else {
9078 Py_DECREF(iobj);
9079 }
9080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009082 if (!isnumok) {
9083 PyErr_Format(PyExc_TypeError,
9084 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009085 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009086 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009087 }
9088 if (flags & F_ZERO)
9089 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 break;
9091
9092 case 'e':
9093 case 'E':
9094 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009095 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 case 'g':
9097 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009098 if (c == 'F')
9099 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009100 pbuf = formatbuf;
9101 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9102 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 if (len < 0)
9104 goto onError;
9105 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009106 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 fill = '0';
9108 break;
9109
9110 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009111 pbuf = formatbuf;
9112 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 if (len < 0)
9114 goto onError;
9115 break;
9116
9117 default:
9118 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009119 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009120 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009121 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009122 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009123 (Py_ssize_t)(fmt - 1 -
9124 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 goto onError;
9126 }
9127 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009128 if (*pbuf == '-' || *pbuf == '+') {
9129 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 len--;
9131 }
9132 else if (flags & F_SIGN)
9133 sign = '+';
9134 else if (flags & F_BLANK)
9135 sign = ' ';
9136 else
9137 sign = 0;
9138 }
9139 if (width < len)
9140 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009141 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 reslen -= rescnt;
9143 rescnt = width + fmtcnt + 100;
9144 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009145 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009146 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009147 PyErr_NoMemory();
9148 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009149 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009150 if (_PyUnicode_Resize(&result, reslen) < 0) {
9151 Py_XDECREF(temp);
9152 goto onError;
9153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 res = PyUnicode_AS_UNICODE(result)
9155 + reslen - rescnt;
9156 }
9157 if (sign) {
9158 if (fill != ' ')
9159 *res++ = sign;
9160 rescnt--;
9161 if (width > len)
9162 width--;
9163 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009164 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009165 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009166 assert(pbuf[1] == c);
9167 if (fill != ' ') {
9168 *res++ = *pbuf++;
9169 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009170 }
Tim Petersfff53252001-04-12 18:38:48 +00009171 rescnt -= 2;
9172 width -= 2;
9173 if (width < 0)
9174 width = 0;
9175 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 if (width > len && !(flags & F_LJUST)) {
9178 do {
9179 --rescnt;
9180 *res++ = fill;
9181 } while (--width > len);
9182 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009183 if (fill == ' ') {
9184 if (sign)
9185 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009186 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009187 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009188 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009189 *res++ = *pbuf++;
9190 *res++ = *pbuf++;
9191 }
9192 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009193 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 res += len;
9195 rescnt -= len;
9196 while (--width >= len) {
9197 --rescnt;
9198 *res++ = ' ';
9199 }
9200 if (dict && (argidx < arglen) && c != '%') {
9201 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009202 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009203 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204 goto onError;
9205 }
9206 Py_XDECREF(temp);
9207 } /* '%' */
9208 } /* until end */
9209 if (argidx < arglen && !dict) {
9210 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009211 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212 goto onError;
9213 }
9214
Thomas Woutersa96affe2006-03-12 00:29:36 +00009215 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9216 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 if (args_owned) {
9218 Py_DECREF(args);
9219 }
9220 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 return (PyObject *)result;
9222
9223 onError:
9224 Py_XDECREF(result);
9225 Py_DECREF(uformat);
9226 if (args_owned) {
9227 Py_DECREF(args);
9228 }
9229 return NULL;
9230}
9231
Jeremy Hylton938ace62002-07-17 16:30:39 +00009232static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009233unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9234
Tim Peters6d6c1a32001-08-02 04:15:00 +00009235static PyObject *
9236unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9237{
9238 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009239 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009240 char *encoding = NULL;
9241 char *errors = NULL;
9242
Guido van Rossume023fe02001-08-30 03:12:59 +00009243 if (type != &PyUnicode_Type)
9244 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009245 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009246 kwlist, &x, &encoding, &errors))
9247 return NULL;
9248 if (x == NULL)
9249 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009250 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009251 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009252 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009253 return PyUnicode_FromEncodedObject(x, encoding, errors);
9254}
9255
Guido van Rossume023fe02001-08-30 03:12:59 +00009256static PyObject *
9257unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9258{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009259 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009260 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009261
9262 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9263 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9264 if (tmp == NULL)
9265 return NULL;
9266 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009267 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009268 if (pnew == NULL) {
9269 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009270 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009271 }
Christian Heimesb186d002008-03-18 15:15:01 +00009272 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009273 if (pnew->str == NULL) {
9274 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009275 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009276 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009277 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009278 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009279 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9280 pnew->length = n;
9281 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009282 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009283 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009284}
9285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009286PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009287"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009288\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009289Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009290encoding defaults to the current default string encoding.\n\
9291errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009292
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009293static PyObject *unicode_iter(PyObject *seq);
9294
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009296 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009297 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 sizeof(PyUnicodeObject), /* tp_size */
9299 0, /* tp_itemsize */
9300 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009301 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009303 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009305 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009306 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009307 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009309 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 (hashfunc) unicode_hash, /* tp_hash*/
9311 0, /* tp_call*/
9312 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009313 PyObject_GenericGetAttr, /* tp_getattro */
9314 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009315 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009316 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9317 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009318 unicode_doc, /* tp_doc */
9319 0, /* tp_traverse */
9320 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009321 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009322 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009323 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009324 0, /* tp_iternext */
9325 unicode_methods, /* tp_methods */
9326 0, /* tp_members */
9327 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009328 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009329 0, /* tp_dict */
9330 0, /* tp_descr_get */
9331 0, /* tp_descr_set */
9332 0, /* tp_dictoffset */
9333 0, /* tp_init */
9334 0, /* tp_alloc */
9335 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009336 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337};
9338
9339/* Initialize the Unicode implementation */
9340
Thomas Wouters78890102000-07-22 19:25:51 +00009341void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009343 int i;
9344
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345 /* XXX - move this array to unicodectype.c ? */
9346 Py_UNICODE linebreak[] = {
9347 0x000A, /* LINE FEED */
9348 0x000D, /* CARRIAGE RETURN */
9349 0x001C, /* FILE SEPARATOR */
9350 0x001D, /* GROUP SEPARATOR */
9351 0x001E, /* RECORD SEPARATOR */
9352 0x0085, /* NEXT LINE */
9353 0x2028, /* LINE SEPARATOR */
9354 0x2029, /* PARAGRAPH SEPARATOR */
9355 };
9356
Fred Drakee4315f52000-05-09 19:53:39 +00009357 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009358 free_list = NULL;
9359 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009361 if (!unicode_empty)
9362 return;
9363
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009364 for (i = 0; i < 256; i++)
9365 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009366 if (PyType_Ready(&PyUnicode_Type) < 0)
9367 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009368
9369 /* initialize the linebreak bloom filter */
9370 bloom_linebreak = make_bloom_mask(
9371 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9372 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009373
9374 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375}
9376
9377/* Finalize the Unicode implementation */
9378
Christian Heimesa156e092008-02-16 07:38:31 +00009379int
9380PyUnicode_ClearFreeList(void)
9381{
9382 int freelist_size = numfree;
9383 PyUnicodeObject *u;
9384
9385 for (u = free_list; u != NULL;) {
9386 PyUnicodeObject *v = u;
9387 u = *(PyUnicodeObject **)u;
9388 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009389 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009390 Py_XDECREF(v->defenc);
9391 PyObject_Del(v);
9392 numfree--;
9393 }
9394 free_list = NULL;
9395 assert(numfree == 0);
9396 return freelist_size;
9397}
9398
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399void
Thomas Wouters78890102000-07-22 19:25:51 +00009400_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009402 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009404 Py_XDECREF(unicode_empty);
9405 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009406
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009407 for (i = 0; i < 256; i++) {
9408 if (unicode_latin1[i]) {
9409 Py_DECREF(unicode_latin1[i]);
9410 unicode_latin1[i] = NULL;
9411 }
9412 }
Christian Heimesa156e092008-02-16 07:38:31 +00009413 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009415
Walter Dörwald16807132007-05-25 13:52:07 +00009416void
9417PyUnicode_InternInPlace(PyObject **p)
9418{
9419 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9420 PyObject *t;
9421 if (s == NULL || !PyUnicode_Check(s))
9422 Py_FatalError(
9423 "PyUnicode_InternInPlace: unicode strings only please!");
9424 /* If it's a subclass, we don't really know what putting
9425 it in the interned dict might do. */
9426 if (!PyUnicode_CheckExact(s))
9427 return;
9428 if (PyUnicode_CHECK_INTERNED(s))
9429 return;
9430 if (interned == NULL) {
9431 interned = PyDict_New();
9432 if (interned == NULL) {
9433 PyErr_Clear(); /* Don't leave an exception */
9434 return;
9435 }
9436 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009437 /* It might be that the GetItem call fails even
9438 though the key is present in the dictionary,
9439 namely when this happens during a stack overflow. */
9440 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009441 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009442 Py_END_ALLOW_RECURSION
9443
Walter Dörwald16807132007-05-25 13:52:07 +00009444 if (t) {
9445 Py_INCREF(t);
9446 Py_DECREF(*p);
9447 *p = t;
9448 return;
9449 }
9450
Martin v. Löwis5b222132007-06-10 09:51:05 +00009451 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009452 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9453 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009454 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009455 return;
9456 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009457 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009458 /* The two references in interned are not counted by refcnt.
9459 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009460 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009461 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9462}
9463
9464void
9465PyUnicode_InternImmortal(PyObject **p)
9466{
9467 PyUnicode_InternInPlace(p);
9468 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9469 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9470 Py_INCREF(*p);
9471 }
9472}
9473
9474PyObject *
9475PyUnicode_InternFromString(const char *cp)
9476{
9477 PyObject *s = PyUnicode_FromString(cp);
9478 if (s == NULL)
9479 return NULL;
9480 PyUnicode_InternInPlace(&s);
9481 return s;
9482}
9483
9484void _Py_ReleaseInternedUnicodeStrings(void)
9485{
9486 PyObject *keys;
9487 PyUnicodeObject *s;
9488 Py_ssize_t i, n;
9489 Py_ssize_t immortal_size = 0, mortal_size = 0;
9490
9491 if (interned == NULL || !PyDict_Check(interned))
9492 return;
9493 keys = PyDict_Keys(interned);
9494 if (keys == NULL || !PyList_Check(keys)) {
9495 PyErr_Clear();
9496 return;
9497 }
9498
9499 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9500 detector, interned unicode strings are not forcibly deallocated;
9501 rather, we give them their stolen references back, and then clear
9502 and DECREF the interned dict. */
9503
9504 n = PyList_GET_SIZE(keys);
9505 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9506 n);
9507 for (i = 0; i < n; i++) {
9508 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9509 switch (s->state) {
9510 case SSTATE_NOT_INTERNED:
9511 /* XXX Shouldn't happen */
9512 break;
9513 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009514 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009515 immortal_size += s->length;
9516 break;
9517 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009518 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009519 mortal_size += s->length;
9520 break;
9521 default:
9522 Py_FatalError("Inconsistent interned string state.");
9523 }
9524 s->state = SSTATE_NOT_INTERNED;
9525 }
9526 fprintf(stderr, "total size of all interned strings: "
9527 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9528 "mortal/immortal\n", mortal_size, immortal_size);
9529 Py_DECREF(keys);
9530 PyDict_Clear(interned);
9531 Py_DECREF(interned);
9532 interned = NULL;
9533}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009534
9535
9536/********************* Unicode Iterator **************************/
9537
9538typedef struct {
9539 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009540 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009541 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9542} unicodeiterobject;
9543
9544static void
9545unicodeiter_dealloc(unicodeiterobject *it)
9546{
9547 _PyObject_GC_UNTRACK(it);
9548 Py_XDECREF(it->it_seq);
9549 PyObject_GC_Del(it);
9550}
9551
9552static int
9553unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9554{
9555 Py_VISIT(it->it_seq);
9556 return 0;
9557}
9558
9559static PyObject *
9560unicodeiter_next(unicodeiterobject *it)
9561{
9562 PyUnicodeObject *seq;
9563 PyObject *item;
9564
9565 assert(it != NULL);
9566 seq = it->it_seq;
9567 if (seq == NULL)
9568 return NULL;
9569 assert(PyUnicode_Check(seq));
9570
9571 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009572 item = PyUnicode_FromUnicode(
9573 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009574 if (item != NULL)
9575 ++it->it_index;
9576 return item;
9577 }
9578
9579 Py_DECREF(seq);
9580 it->it_seq = NULL;
9581 return NULL;
9582}
9583
9584static PyObject *
9585unicodeiter_len(unicodeiterobject *it)
9586{
9587 Py_ssize_t len = 0;
9588 if (it->it_seq)
9589 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009590 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009591}
9592
9593PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9594
9595static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009596 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9597 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009598 {NULL, NULL} /* sentinel */
9599};
9600
9601PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009602 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009603 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009604 sizeof(unicodeiterobject), /* tp_basicsize */
9605 0, /* tp_itemsize */
9606 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009607 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009608 0, /* tp_print */
9609 0, /* tp_getattr */
9610 0, /* tp_setattr */
9611 0, /* tp_compare */
9612 0, /* tp_repr */
9613 0, /* tp_as_number */
9614 0, /* tp_as_sequence */
9615 0, /* tp_as_mapping */
9616 0, /* tp_hash */
9617 0, /* tp_call */
9618 0, /* tp_str */
9619 PyObject_GenericGetAttr, /* tp_getattro */
9620 0, /* tp_setattro */
9621 0, /* tp_as_buffer */
9622 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9623 0, /* tp_doc */
9624 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9625 0, /* tp_clear */
9626 0, /* tp_richcompare */
9627 0, /* tp_weaklistoffset */
9628 PyObject_SelfIter, /* tp_iter */
9629 (iternextfunc)unicodeiter_next, /* tp_iternext */
9630 unicodeiter_methods, /* tp_methods */
9631 0,
9632};
9633
9634static PyObject *
9635unicode_iter(PyObject *seq)
9636{
9637 unicodeiterobject *it;
9638
9639 if (!PyUnicode_Check(seq)) {
9640 PyErr_BadInternalCall();
9641 return NULL;
9642 }
9643 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9644 if (it == NULL)
9645 return NULL;
9646 it->it_index = 0;
9647 Py_INCREF(seq);
9648 it->it_seq = (PyUnicodeObject *)seq;
9649 _PyObject_GC_TRACK(it);
9650 return (PyObject *)it;
9651}
9652
Martin v. Löwis5b222132007-06-10 09:51:05 +00009653size_t
9654Py_UNICODE_strlen(const Py_UNICODE *u)
9655{
9656 int res = 0;
9657 while(*u++)
9658 res++;
9659 return res;
9660}
9661
9662Py_UNICODE*
9663Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9664{
9665 Py_UNICODE *u = s1;
9666 while ((*u++ = *s2++));
9667 return s1;
9668}
9669
9670Py_UNICODE*
9671Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9672{
9673 Py_UNICODE *u = s1;
9674 while ((*u++ = *s2++))
9675 if (n-- == 0)
9676 break;
9677 return s1;
9678}
9679
9680int
9681Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9682{
9683 while (*s1 && *s2 && *s1 == *s2)
9684 s1++, s2++;
9685 if (*s1 && *s2)
9686 return (*s1 < *s2) ? -1 : +1;
9687 if (*s1)
9688 return 1;
9689 if (*s2)
9690 return -1;
9691 return 0;
9692}
9693
9694Py_UNICODE*
9695Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9696{
9697 const Py_UNICODE *p;
9698 for (p = s; *p; p++)
9699 if (*p == c)
9700 return (Py_UNICODE*)p;
9701 return NULL;
9702}
9703
9704
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009705#ifdef __cplusplus
9706}
9707#endif
9708
9709
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009710/*
9711Local variables:
9712c-basic-offset: 4
9713indent-tabs-mode: nil
9714End:
9715*/