blob: 2beb0984b45a64eb5816a8a4cb697c16c6a38cd8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code (0..127): an entry is 1 when the
   character is treated as whitespace and 0 otherwise.  Sixteen entries
   per row, so each row covers one hexadecimal "decade". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
                  0x0B VERTICAL TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
/* Lookup table indexed by ASCII code (0..127): an entry is 1 when the
   character is treated as a line break and 0 otherwise.  Sixteen entries
   per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
198/* stuff to implement simple "bloom filters" for Unicode characters.
199 to keep things simple, we use a single bitmask, using the least 5
200 bits from each unicode characters as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK for characters outside the ASCII range. */
static BLOOM_MASK bloom_linebreak;

/* Test whether the bit selected by the low five bits of `ch` is set in
   `mask`.  Evaluates to the selected bit (non-zero) or 0.

   The shifted constant is `1UL`, not `1`: the shift count can be 31, and
   left-shifting a plain int into its sign bit is undefined behaviour (and
   typically sign-extends into the upper bits once widened to unsigned
   long).  `mask` is parenthesized so that expression arguments such as
   `a | b` are tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when `ch` is a line break.  Characters below 128 are looked up
   directly in ascii_linebreak[]; for all others the bloom mask is used as
   a cheap pre-filter before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Non-zero when `chr` is one of the `setlen` characters in `set`.  The
   bloom `mask` is checked first so that most non-members are rejected
   without scanning the set.  The whole expansion is parenthesized so the
   macro behaves as a single operand (e.g. under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Martin v. Löwis18e16552006-02-15 17:27:45 +0000421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422{
423 register PyUnicodeObject *v;
424
425 /* Argument checks */
426 if (unicode == NULL) {
427 PyErr_BadInternalCall();
428 return -1;
429 }
430 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000431 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432 PyErr_BadInternalCall();
433 return -1;
434 }
435
436 /* Resizing unicode_empty and single character objects is not
437 possible since these are being shared. We simply return a fresh
438 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000439 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 (v == unicode_empty || v->length == 1)) {
441 PyUnicodeObject *w = _PyUnicode_New(length);
442 if (w == NULL)
443 return -1;
444 Py_UNICODE_COPY(w->str, v->str,
445 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000446 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 *unicode = (PyObject *)w;
448 return 0;
449 }
450
451 /* Note that we don't have to modify *unicode for unshared Unicode
452 objects, since we can modify them in-place. */
453 return unicode_resize(v, length);
454}
455
456/* Internal API for use in unicodeobject.c only ! */
457#define _PyUnicode_Resize(unicodevar, length) \
458 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
459
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000461 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462{
463 PyUnicodeObject *unicode;
464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000465 /* If the Unicode data is known at construction time, we can apply
466 some optimizations which share commonly used objects. */
467 if (u != NULL) {
468
469 /* Optimization for empty strings */
470 if (size == 0 && unicode_empty != NULL) {
471 Py_INCREF(unicode_empty);
472 return (PyObject *)unicode_empty;
473 }
474
475 /* Single character Unicode objects in the Latin-1 range are
476 shared when using this constructor */
477 if (size == 1 && *u < 256) {
478 unicode = unicode_latin1[*u];
479 if (!unicode) {
480 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 if (!unicode)
482 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000483 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484 unicode_latin1[*u] = unicode;
485 }
486 Py_INCREF(unicode);
487 return (PyObject *)unicode;
488 }
489 }
Tim Petersced69f82003-09-16 20:30:58 +0000490
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the Unicode data into the new object */
496 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000497 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498
499 return (PyObject *)unicode;
500}
501
Walter Dörwaldd2034312007-05-18 16:29:38 +0000502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000503{
504 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000505
506 if (size < 0) {
507 PyErr_SetString(PyExc_SystemError,
508 "Negative size passed to PyUnicode_FromStringAndSize");
509 return NULL;
510 }
511
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000513 some optimizations which share commonly used objects.
514 Also, this means the input must be UTF-8, so fall back to the
515 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (u != NULL) {
517
518 /* Optimization for empty strings */
519 if (size == 0 && unicode_empty != NULL) {
520 Py_INCREF(unicode_empty);
521 return (PyObject *)unicode_empty;
522 }
523
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 /* Single characters are shared when using this constructor.
525 Restrict to ASCII, since the input must be UTF-8. */
526 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000527 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (!unicode) {
529 unicode = _PyUnicode_New(1);
530 if (!unicode)
531 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000532 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000533 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 }
535 Py_INCREF(unicode);
536 return (PyObject *)unicode;
537 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000538
539 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 }
541
Walter Dörwald55507312007-05-18 13:12:10 +0000542 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000543 if (!unicode)
544 return NULL;
545
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000546 return (PyObject *)unicode;
547}
548
Walter Dörwaldd2034312007-05-18 16:29:38 +0000549PyObject *PyUnicode_FromString(const char *u)
550{
551 size_t size = strlen(u);
552 if (size > PY_SSIZE_T_MAX) {
553 PyErr_SetString(PyExc_OverflowError, "input too long");
554 return NULL;
555 }
556
557 return PyUnicode_FromStringAndSize(u, size);
558}
559
Guido van Rossumd57fd912000-03-10 22:53:23 +0000560#ifdef HAVE_WCHAR_H
561
562PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000563 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564{
565 PyUnicodeObject *unicode;
566
567 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000568 if (size == 0)
569 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
Martin v. Löwis790465f2008-04-05 20:41:37 +0000574 if (size == -1) {
575 size = wcslen(w);
576 }
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578 unicode = _PyUnicode_New(size);
579 if (!unicode)
580 return NULL;
581
582 /* Copy the wchar_t data into the new object */
583#ifdef HAVE_USABLE_WCHAR_T
584 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000585#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 {
587 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000590 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 *u++ = *w++;
592 }
593#endif
594
595 return (PyObject *)unicode;
596}
597
Walter Dörwald346737f2007-05-31 10:44:43 +0000598static void
599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
600{
601 *fmt++ = '%';
602 if (width) {
603 if (zeropad)
604 *fmt++ = '0';
605 fmt += sprintf(fmt, "%d", width);
606 }
607 if (precision)
608 fmt += sprintf(fmt, ".%d", precision);
609 if (longflag)
610 *fmt++ = 'l';
611 else if (size_tflag) {
612 char *f = PY_FORMAT_SIZE_T;
613 while (*f)
614 *fmt++ = *f++;
615 }
616 *fmt++ = c;
617 *fmt = '\0';
618}
619
/* Append the NUL-terminated string `string` at the output cursor `s`,
   advancing `s` past the copied characters (no terminator is written).
   Relies on the variables `copy` and `s` being in scope at the point of
   use. */
#define appendstring(string) \
    {for (copy = (string); *copy; ) *s++ = *copy++;}
621
622PyObject *
623PyUnicode_FromFormatV(const char *format, va_list vargs)
624{
625 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000626 Py_ssize_t callcount = 0;
627 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000628 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000629 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000630 int width = 0;
631 int precision = 0;
632 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000633 const char* f;
634 Py_UNICODE *s;
635 PyObject *string;
636 /* used by sprintf */
637 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000638 /* use abuffer instead of buffer, if we need more space
639 * (which can happen if there's a format specifier with width). */
640 char *abuffer = NULL;
641 char *realbuffer;
642 Py_ssize_t abuffersize = 0;
643 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000644 const char *copy;
645
646#ifdef VA_LIST_IS_ARRAY
647 Py_MEMCPY(count, vargs, sizeof(va_list));
648#else
649#ifdef __va_copy
650 __va_copy(count, vargs);
651#else
652 count = vargs;
653#endif
654#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 /* step 1: count the number of %S/%R/%A format specifications
656 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
657 * these objects once during step 3 and put the result in
658 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000660 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 ++callcount;
662 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000663 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000664 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000665 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000666 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (!callresults) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 callresult = callresults;
672 }
673 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 for (f = format; *f; f++) {
675 if (*f == '%') {
676 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000677 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000678 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681 ;
682
683 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
684 * they don't affect the amount of space we reserve.
685 */
686 if ((*f == 'l' || *f == 'z') &&
687 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000688 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000689
690 switch (*f) {
691 case 'c':
692 (void)va_arg(count, int);
693 /* fall through... */
694 case '%':
695 n++;
696 break;
697 case 'd': case 'u': case 'i': case 'x':
698 (void) va_arg(count, int);
699 /* 20 bytes is enough to hold a 64-bit
700 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000701 This isn't enough for octal.
702 If a width is specified we need more
703 (which we allocate later). */
704 if (width < 20)
705 width = 20;
706 n += width;
707 if (abuffersize < width)
708 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 break;
710 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000711 {
712 /* UTF-8 */
713 unsigned char*s;
714 s = va_arg(count, unsigned char*);
715 while (*s) {
716 if (*s < 128) {
717 n++; s++;
718 } else if (*s < 0xc0) {
719 /* invalid UTF-8 */
720 n++; s++;
721 } else if (*s < 0xc0) {
722 n++;
723 s++; if(!*s)break;
724 s++;
725 } else if (*s < 0xe0) {
726 n++;
727 s++; if(!*s)break;
728 s++; if(!*s)break;
729 s++;
730 } else {
731 #ifdef Py_UNICODE_WIDE
732 n++;
733 #else
734 n+=2;
735 #endif
736 s++; if(!*s)break;
737 s++; if(!*s)break;
738 s++; if(!*s)break;
739 s++;
740 }
741 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 case 'U':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 assert(obj && PyUnicode_Check(obj));
748 n += PyUnicode_GET_SIZE(obj);
749 break;
750 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000751 case 'V':
752 {
753 PyObject *obj = va_arg(count, PyObject *);
754 const char *str = va_arg(count, const char *);
755 assert(obj || str);
756 assert(!obj || PyUnicode_Check(obj));
757 if (obj)
758 n += PyUnicode_GET_SIZE(obj);
759 else
760 n += strlen(str);
761 break;
762 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 case 'S':
764 {
765 PyObject *obj = va_arg(count, PyObject *);
766 PyObject *str;
767 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000768 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000769 if (!str)
770 goto fail;
771 n += PyUnicode_GET_SIZE(str);
772 /* Remember the str and switch to the next slot */
773 *callresult++ = str;
774 break;
775 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000776 case 'R':
777 {
778 PyObject *obj = va_arg(count, PyObject *);
779 PyObject *repr;
780 assert(obj);
781 repr = PyObject_Repr(obj);
782 if (!repr)
783 goto fail;
784 n += PyUnicode_GET_SIZE(repr);
785 /* Remember the repr and switch to the next slot */
786 *callresult++ = repr;
787 break;
788 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000789 case 'A':
790 {
791 PyObject *obj = va_arg(count, PyObject *);
792 PyObject *ascii;
793 assert(obj);
794 ascii = PyObject_ASCII(obj);
795 if (!ascii)
796 goto fail;
797 n += PyUnicode_GET_SIZE(ascii);
798 /* Remember the repr and switch to the next slot */
799 *callresult++ = ascii;
800 break;
801 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 case 'p':
803 (void) va_arg(count, int);
804 /* maximum 64-bit pointer representation:
805 * 0xffffffffffffffff
806 * so 19 characters is enough.
807 * XXX I count 18 -- what's the extra for?
808 */
809 n += 19;
810 break;
811 default:
812 /* if we stumble upon an unknown
813 formatting code, copy the rest of
814 the format string to the output
815 string. (we cannot just skip the
816 code, since there's no way to know
817 what's in the argument list) */
818 n += strlen(p);
819 goto expand;
820 }
821 } else
822 n++;
823 }
824 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000825 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000826 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (!abuffer) {
828 PyErr_NoMemory();
829 goto fail;
830 }
831 realbuffer = abuffer;
832 }
833 else
834 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000835 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 we don't have to resize the string.
838 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000839 string = PyUnicode_FromUnicode(NULL, n);
840 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842
843 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845
846 for (f = format; *f; f++) {
847 if (*f == '%') {
848 const char* p = f++;
849 int longflag = 0;
850 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 zeropad = (*f == '0');
852 /* parse the width.precision part */
853 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 width = (width*10) + *f++ - '0';
856 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 if (*f == '.') {
858 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000859 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 /* handle the long flag, but only for %ld and %lu.
863 others can be added when necessary. */
864 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
865 longflag = 1;
866 ++f;
867 }
868 /* handle the size_t flag. */
869 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
870 size_tflag = 1;
871 ++f;
872 }
873
874 switch (*f) {
875 case 'c':
876 *s++ = va_arg(vargs, int);
877 break;
878 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000879 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000880 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, int));
886 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
888 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000889 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000890 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000904 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
905 sprintf(realbuffer, fmt, va_arg(vargs, int));
906 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907 break;
908 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000909 {
910 /* Parameter must be UTF-8 encoded.
911 In case of encoding errors, use
912 the replacement character. */
913 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000914 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000915 u = PyUnicode_DecodeUTF8(p, strlen(p),
916 "replace");
917 if (!u)
918 goto fail;
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
920 PyUnicode_GET_SIZE(u));
921 s += PyUnicode_GET_SIZE(u);
922 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000923 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000924 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 case 'U':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000928 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
929 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
930 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000931 break;
932 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000933 case 'V':
934 {
935 PyObject *obj = va_arg(vargs, PyObject *);
936 const char *str = va_arg(vargs, const char *);
937 if (obj) {
938 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
939 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
940 s += size;
941 } else {
942 appendstring(str);
943 }
944 break;
945 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000946 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 case 'R':
948 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 Py_UNICODE *ucopy;
950 Py_ssize_t usize;
951 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 /* unused, since we already have the result */
953 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000954 ucopy = PyUnicode_AS_UNICODE(*callresult);
955 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 for (upos = 0; upos<usize;)
957 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000958 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000959 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 ++callresult;
962 break;
963 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000964 case 'p':
965 sprintf(buffer, "%p", va_arg(vargs, void*));
966 /* %p is ill-defined: ensure leading 0x. */
967 if (buffer[1] == 'X')
968 buffer[1] = 'x';
969 else if (buffer[1] != 'x') {
970 memmove(buffer+2, buffer, strlen(buffer)+1);
971 buffer[0] = '0';
972 buffer[1] = 'x';
973 }
974 appendstring(buffer);
975 break;
976 case '%':
977 *s++ = '%';
978 break;
979 default:
980 appendstring(p);
981 goto end;
982 }
983 } else
984 *s++ = *f;
985 }
986
987 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000988 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000989 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000990 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000992 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
993 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000994 fail:
995 if (callresults) {
996 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000997 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000998 Py_DECREF(*callresult2);
999 ++callresult2;
1000 }
Christian Heimesb186d002008-03-18 15:15:01 +00001001 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001002 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001003 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001004 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001005 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001006}
1007
1008#undef appendstring
1009
1010PyObject *
1011PyUnicode_FromFormat(const char *format, ...)
1012{
1013 PyObject* ret;
1014 va_list vargs;
1015
1016#ifdef HAVE_STDARG_PROTOTYPES
1017 va_start(vargs, format);
1018#else
1019 va_start(vargs);
1020#endif
1021 ret = PyUnicode_FromFormatV(format, vargs);
1022 va_end(vargs);
1023 return ret;
1024}
1025
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1027 wchar_t *w,
1028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029{
1030 if (unicode == NULL) {
1031 PyErr_BadInternalCall();
1032 return -1;
1033 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001034
1035 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037 size = PyUnicode_GET_SIZE(unicode) + 1;
1038
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039#ifdef HAVE_USABLE_WCHAR_T
1040 memcpy(w, unicode->str, size * sizeof(wchar_t));
1041#else
1042 {
1043 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001044 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001046 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 *w++ = *u++;
1048 }
1049#endif
1050
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001051 if (size > PyUnicode_GET_SIZE(unicode))
1052 return PyUnicode_GET_SIZE(unicode);
1053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 return size;
1055}
1056
1057#endif
1058
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059PyObject *PyUnicode_FromOrdinal(int ordinal)
1060{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001061 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001063 if (ordinal < 0 || ordinal > 0x10ffff) {
1064 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001065 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001066 return NULL;
1067 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001068
1069#ifndef Py_UNICODE_WIDE
1070 if (ordinal > 0xffff) {
1071 ordinal -= 0x10000;
1072 s[0] = 0xD800 | (ordinal >> 10);
1073 s[1] = 0xDC00 | (ordinal & 0x3FF);
1074 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075 }
1076#endif
1077
Hye-Shik Chang40574832004-04-06 07:24:51 +00001078 s[0] = (Py_UNICODE)ordinal;
1079 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001080}
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082PyObject *PyUnicode_FromObject(register PyObject *obj)
1083{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001085 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 if (PyUnicode_CheckExact(obj)) {
1087 Py_INCREF(obj);
1088 return obj;
1089 }
1090 if (PyUnicode_Check(obj)) {
1091 /* For a Unicode subtype that's not a Unicode object,
1092 return a true Unicode object with the same data. */
1093 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1094 PyUnicode_GET_SIZE(obj));
1095 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001096 PyErr_Format(PyExc_TypeError,
1097 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001098 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100}
1101
1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1103 const char *encoding,
1104 const char *errors)
1105{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001107 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (obj == NULL) {
1111 PyErr_BadInternalCall();
1112 return NULL;
1113 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115 if (PyUnicode_Check(obj)) {
1116 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001117 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120
1121 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001122 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001123 s = PyBytes_AS_STRING(obj);
1124 len = PyBytes_GET_SIZE(obj);
1125 }
1126 else if (PyByteArray_Check(obj)) {
1127 s = PyByteArray_AS_STRING(obj);
1128 len = PyByteArray_GET_SIZE(obj);
1129 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001130 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1131 /* Overwrite the error message with something more useful in
1132 case of a TypeError. */
1133 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001134 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001135 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001137 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001138 goto onError;
1139 }
Tim Petersced69f82003-09-16 20:30:58 +00001140
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001141 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 if (len == 0) {
1143 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 }
Tim Petersced69f82003-09-16 20:30:58 +00001146 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001148
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 return v;
1150
1151 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153}
1154
1155PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 const char *encoding,
1158 const char *errors)
1159{
1160 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001161 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 char lower[20]; /* Enough for any encoding name we recognize */
1163 char *l;
1164 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165
1166 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Convert encoding to lower case and replace '_' with '-' in order to
1170 catch e.g. UTF_8 */
1171 e = encoding;
1172 l = lower;
1173 while (*e && l < &lower[(sizeof lower) - 2]) {
1174 if (ISUPPER(*e)) {
1175 *l++ = TOLOWER(*e++);
1176 }
1177 else if (*e == '_') {
1178 *l++ = '-';
1179 e++;
1180 }
1181 else {
1182 *l++ = *e++;
1183 }
1184 }
1185 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001186
1187 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001188 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 else if ((strcmp(lower, "latin-1") == 0) ||
1191 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001197 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "utf-16") == 0)
1200 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1201 else if (strcmp(lower, "utf-32") == 0)
1202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203
1204 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001205 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001206 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001208 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (buffer == NULL)
1210 goto onError;
1211 unicode = PyCodec_Decode(buffer, encoding, errors);
1212 if (unicode == NULL)
1213 goto onError;
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001216 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001217 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_DECREF(unicode);
1219 goto onError;
1220 }
1221 Py_DECREF(buffer);
1222 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 onError:
1225 Py_XDECREF(buffer);
1226 return NULL;
1227}
1228
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001229PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1230 const char *encoding,
1231 const char *errors)
1232{
1233 PyObject *v;
1234
1235 if (!PyUnicode_Check(unicode)) {
1236 PyErr_BadArgument();
1237 goto onError;
1238 }
1239
1240 if (encoding == NULL)
1241 encoding = PyUnicode_GetDefaultEncoding();
1242
1243 /* Decode via the codec registry */
1244 v = PyCodec_Decode(unicode, encoding, errors);
1245 if (v == NULL)
1246 goto onError;
1247 return v;
1248
1249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001253PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1254 const char *encoding,
1255 const char *errors)
1256{
1257 PyObject *v;
1258
1259 if (!PyUnicode_Check(unicode)) {
1260 PyErr_BadArgument();
1261 goto onError;
1262 }
1263
1264 if (encoding == NULL)
1265 encoding = PyUnicode_GetDefaultEncoding();
1266
1267 /* Decode via the codec registry */
1268 v = PyCodec_Decode(unicode, encoding, errors);
1269 if (v == NULL)
1270 goto onError;
1271 if (!PyUnicode_Check(v)) {
1272 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001273 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001274 Py_TYPE(v)->tp_name);
1275 Py_DECREF(v);
1276 goto onError;
1277 }
1278 return v;
1279
1280 onError:
1281 return NULL;
1282}
1283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 unicode = PyUnicode_FromUnicode(s, size);
1292 if (unicode == NULL)
1293 return NULL;
1294 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1295 Py_DECREF(unicode);
1296 return v;
1297}
1298
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1300 const char *encoding,
1301 const char *errors)
1302{
1303 PyObject *v;
1304
1305 if (!PyUnicode_Check(unicode)) {
1306 PyErr_BadArgument();
1307 goto onError;
1308 }
1309
1310 if (encoding == NULL)
1311 encoding = PyUnicode_GetDefaultEncoding();
1312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 return v;
1318
1319 onError:
1320 return NULL;
1321}
1322
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1324 const char *encoding,
1325 const char *errors)
1326{
1327 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 if (!PyUnicode_Check(unicode)) {
1330 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 }
Fred Drakee4315f52000-05-09 19:53:39 +00001333
Tim Petersced69f82003-09-16 20:30:58 +00001334 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001335 encoding = PyUnicode_GetDefaultEncoding();
1336
1337 /* Shortcuts for common default encodings */
1338 if (errors == NULL) {
1339 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001340 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001341 else if (strcmp(encoding, "latin-1") == 0)
1342 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001343#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1344 else if (strcmp(encoding, "mbcs") == 0)
1345 return PyUnicode_AsMBCSString(unicode);
1346#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001347 else if (strcmp(encoding, "ascii") == 0)
1348 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001349 /* During bootstrap, we may need to find the encodings
1350 package, to load the file system encoding, and require the
1351 file system encoding in order to load the encodings
1352 package.
1353
1354 Break out of this dependency by assuming that the path to
1355 the encodings module is ASCII-only. XXX could try wcstombs
1356 instead, if the file system encoding is the locale's
1357 encoding. */
1358 else if (Py_FileSystemDefaultEncoding &&
1359 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1360 !PyThreadState_GET()->interp->codecs_initialized)
1361 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363
1364 /* Encode via the codec registry */
1365 v = PyCodec_Encode(unicode, encoding, errors);
1366 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001367 return NULL;
1368
1369 /* The normal path */
1370 if (PyBytes_Check(v))
1371 return v;
1372
1373 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001374 if (PyByteArray_Check(v)) {
1375 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001376 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001377 PyOS_snprintf(msg, sizeof(msg),
1378 "encoder %s returned buffer instead of bytes",
1379 encoding);
1380 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001381 Py_DECREF(v);
1382 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001383 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001384
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001385 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1386 Py_DECREF(v);
1387 return b;
1388 }
1389
1390 PyErr_Format(PyExc_TypeError,
1391 "encoder did not return a bytes object (type=%.400s)",
1392 Py_TYPE(v)->tp_name);
1393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394 return NULL;
1395}
1396
1397PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1398 const char *encoding,
1399 const char *errors)
1400{
1401 PyObject *v;
1402
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407
1408 if (encoding == NULL)
1409 encoding = PyUnicode_GetDefaultEncoding();
1410
1411 /* Encode via the codec registry */
1412 v = PyCodec_Encode(unicode, encoding, errors);
1413 if (v == NULL)
1414 goto onError;
1415 if (!PyUnicode_Check(v)) {
1416 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001417 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001418 Py_TYPE(v)->tp_name);
1419 Py_DECREF(v);
1420 goto onError;
1421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001423
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 onError:
1425 return NULL;
1426}
1427
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001428PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1429 const char *errors)
1430{
1431 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001432 if (v)
1433 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001434 if (errors != NULL)
1435 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001436 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001437 PyUnicode_GET_SIZE(unicode),
1438 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001439 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001440 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001441 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001442 return v;
1443}
1444
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001445PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001446PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001448 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1449}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001450
Christian Heimes5894ba72007-11-04 11:43:14 +00001451PyObject*
1452PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1453{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001454 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1455 can be undefined. If it is case, decode using UTF-8. The following assumes
1456 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1457 bootstrapping process where the codecs aren't ready yet.
1458 */
1459 if (Py_FileSystemDefaultEncoding) {
1460#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001461 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001462 return PyUnicode_DecodeMBCS(s, size, "replace");
1463 }
1464#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001465 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001466 return PyUnicode_DecodeUTF8(s, size, "replace");
1467 }
1468#endif
1469 return PyUnicode_Decode(s, size,
1470 Py_FileSystemDefaultEncoding,
1471 "replace");
1472 }
1473 else {
1474 return PyUnicode_DecodeUTF8(s, size, "replace");
1475 }
1476}
1477
Martin v. Löwis5b222132007-06-10 09:51:05 +00001478char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001479_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480{
Christian Heimesf3863112007-11-22 07:46:41 +00001481 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001482 if (!PyUnicode_Check(unicode)) {
1483 PyErr_BadArgument();
1484 return NULL;
1485 }
Christian Heimesf3863112007-11-22 07:46:41 +00001486 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1487 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001488 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001489 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001490 *psize = PyBytes_GET_SIZE(bytes);
1491 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001492}
1493
1494char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001495_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001496{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001498}
1499
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1501{
1502 if (!PyUnicode_Check(unicode)) {
1503 PyErr_BadArgument();
1504 goto onError;
1505 }
1506 return PyUnicode_AS_UNICODE(unicode);
1507
1508 onError:
1509 return NULL;
1510}
1511
Martin v. Löwis18e16552006-02-15 17:27:45 +00001512Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513{
1514 if (!PyUnicode_Check(unicode)) {
1515 PyErr_BadArgument();
1516 goto onError;
1517 }
1518 return PyUnicode_GET_SIZE(unicode);
1519
1520 onError:
1521 return -1;
1522}
1523
/* Return the name of the default encoding, i.e. the current value of the
   file-level unicode_default_encoding string.  The pointer is handed back
   as-is; nothing is copied. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1528
1529int PyUnicode_SetDefaultEncoding(const char *encoding)
1530{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001531 if (strcmp(encoding, unicode_default_encoding) != 0) {
1532 PyErr_Format(PyExc_ValueError,
1533 "Can only set default encoding to %s",
1534 unicode_default_encoding);
1535 return -1;
1536 }
Fred Drakee4315f52000-05-09 19:53:39 +00001537 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001538}
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540/* error handling callback helper:
1541 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001542 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 and adjust various state variables.
1544 return 0 on success, -1 on error
1545*/
1546
1547static
1548int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1549 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001550 const char **input, const char **inend, Py_ssize_t *startinpos,
1551 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001552 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001554 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555
1556 PyObject *restuple = NULL;
1557 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001558 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001559 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t requiredsize;
1561 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001562 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001563 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001565 int res = -1;
1566
1567 if (*errorHandler == NULL) {
1568 *errorHandler = PyCodec_LookupError(errors);
1569 if (*errorHandler == NULL)
1570 goto onError;
1571 }
1572
1573 if (*exceptionObject == NULL) {
1574 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001575 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 if (*exceptionObject == NULL)
1577 goto onError;
1578 }
1579 else {
1580 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1581 goto onError;
1582 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1585 goto onError;
1586 }
1587
1588 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1589 if (restuple == NULL)
1590 goto onError;
1591 if (!PyTuple_Check(restuple)) {
1592 PyErr_Format(PyExc_TypeError, &argparse[4]);
1593 goto onError;
1594 }
1595 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1596 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001597
1598 /* Copy back the bytes variables, which might have been modified by the
1599 callback */
1600 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1601 if (!inputobj)
1602 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001603 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001604 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1605 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001606 *input = PyBytes_AS_STRING(inputobj);
1607 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001608 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001609 /* we can DECREF safely, as the exception has another reference,
1610 so the object won't go away. */
1611 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001614 newpos = insize+newpos;
1615 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001616 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001617 goto onError;
1618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619
1620 /* need more space? (at least enough for what we
1621 have+the replacement+the rest of the string (starting
1622 at the new input position), so we won't have to check space
1623 when there are no errors in the rest of the string) */
1624 repptr = PyUnicode_AS_UNICODE(repunicode);
1625 repsize = PyUnicode_GET_SIZE(repunicode);
1626 requiredsize = *outpos + repsize + insize-newpos;
1627 if (requiredsize > outsize) {
1628 if (requiredsize<2*outsize)
1629 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001630 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 goto onError;
1632 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1633 }
1634 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001635 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 Py_UNICODE_COPY(*outptr, repptr, repsize);
1637 *outptr += repsize;
1638 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 /* we made it! */
1641 res = 0;
1642
1643 onError:
1644 Py_XDECREF(restuple);
1645 return res;
1646}
1647
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648/* --- UTF-7 Codec -------------------------------------------------------- */
1649
1650/* see RFC2152 for details */
1651
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

       Indexed by 7-bit character code, 16 entries per row.  The table is
       only ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1670
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c cannot be written directly: it is outside the 7-bit
   range, is marked special (1) in utf7_special, or falls in one of the
   optional classes (2 = whitespace, 3 = Set O) that the caller asked to
   have encoded via encodeWS / encodeO. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Value (0..63) of base64 digit c; c must satisfy B64CHAR(c). */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to *out++ while at least 6 bits are pending in ch;
   bits holds the number of pending bits and is decremented accordingly.
   Wrapped in do/while(0) so that the macro is a single statement and is
   safe in an unbraced if/else. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits) - 6));               \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* Emit 16-bit units to *out++ while at least 16 bits are pending in ch.
   A unit in 0xDC00..0xDFFF sets errmsg, sets the surrogate flag and jumps
   to the utf7Error label of the enclosing function; while the flag is set
   the next unit is dropped and the flag cleared.  Requires `errmsg` and a
   `utf7Error` label in the calling scope. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh =                                          \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);        \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high      \
                   surrogate so let's not bother seeing if the low      \
                   surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't     \
                   represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713
/* Decode `size` bytes of UTF-7 data starting at `s`.

   Equivalent to PyUnicode_DecodeUTF7Stateful() with consumed == NULL, so
   an unterminated shift sequence at the end of the input is reported
   through the error handler selected by `errors`. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1720
/* Decode `size` bytes of UTF-7 data starting at `s` into a new unicode
   object.

   errors:   name of the error handler, passed on to
             unicode_decode_call_errorhandler().
   consumed: if NULL, input that ends inside a shift sequence is reported
             as an "unterminated shift sequence" error.  If non-NULL, that
             is not an error; *consumed is set to the offset of the '+'
             that opened the unfinished shift sequence, or to the number
             of bytes processed when the input does not end inside one.

   Returns the new object, or NULL on error. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    /* inShift: currently inside a '+'...'-' base64 section.
       charsleft/bitsleft: bit accumulator for that section and the number
       of valid bits in it.
       surrogate: state flag maintained by the DECODE macro. */
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Allocate one output unit per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      /* Re-entry point used after the end-of-input error handler below has
         moved s back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence: explicit ('-') or implicit
                   (any non-base64 character). */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* A '-' directly after the terminating '-' is emitted
                       as a literal; shift mode is re-entered so that the
                       next iteration consumes that second '-' as the end
                       of an empty shift sequence. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The character that implicitly ended the sequence is
                       itself part of the output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: shift its 6 bits into the accumulator and
                   emit any complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts; used for error
               reporting and for *consumed. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Hand the error to the handler; it may resize the output and
           reposition s and p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence and the caller did not ask for
       stateful decoding: that is an error. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have pointed s back into the input. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the output to the number of units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1866
1867
1868PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001869 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 int encodeSetO,
1871 int encodeWhiteSpace,
1872 const char *errors)
1873{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001874 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001875 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001876 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 unsigned int bitsleft = 0;
1880 unsigned long charsleft = 0;
1881 char * out;
1882 char * start;
1883
1884 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001885 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001886
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001887 if (cbAllocated / 5 != size)
1888 return PyErr_NoMemory();
1889
Christian Heimes9c4756e2008-05-26 13:22:05 +00001890 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001891 if (v == NULL)
1892 return NULL;
1893
Christian Heimes9c4756e2008-05-26 13:22:05 +00001894 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001895 for (;i < size; ++i) {
1896 Py_UNICODE ch = s[i];
1897
1898 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001899 if (ch == '+') {
1900 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001901 *out++ = '-';
1902 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1903 charsleft = ch;
1904 bitsleft = 16;
1905 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001906 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001907 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 } else {
1909 *out++ = (char) ch;
1910 }
1911 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1913 *out++ = B64(charsleft << (6-bitsleft));
1914 charsleft = 0;
1915 bitsleft = 0;
1916 /* Characters not in the BASE64 set implicitly unshift the sequence
1917 so no '-' is required, except if the character is itself a '-' */
1918 if (B64CHAR(ch) || ch == '-') {
1919 *out++ = '-';
1920 }
1921 inShift = 0;
1922 *out++ = (char) ch;
1923 } else {
1924 bitsleft += 16;
1925 charsleft = (charsleft << 16) | ch;
1926 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1927
1928 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001929 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001930 or '-' then the shift sequence will be terminated implicitly and we
1931 don't have to insert a '-'. */
1932
1933 if (bitsleft == 0) {
1934 if (i + 1 < size) {
1935 Py_UNICODE ch2 = s[i+1];
1936
1937 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001938
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 } else if (B64CHAR(ch2) || ch2 == '-') {
1940 *out++ = '-';
1941 inShift = 0;
1942 } else {
1943 inShift = 0;
1944 }
1945
1946 }
1947 else {
1948 *out++ = '-';
1949 inShift = 0;
1950 }
1951 }
Tim Petersced69f82003-09-16 20:30:58 +00001952 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001953 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 if (bitsleft) {
1956 *out++= B64(charsleft << (6-bitsleft) );
1957 *out++ = '-';
1958 }
1959
Christian Heimes72b710a2008-05-26 13:28:38 +00001960 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001961 Py_DECREF(v);
1962 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001963}
1964
1965#undef SPECIAL
1966#undef B64
1967#undef B64CHAR
1968#undef UB64
1969#undef ENCODE
1970#undef DECODE
1971
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972/* --- UTF-8 Codec -------------------------------------------------------- */
1973
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       Indexed by the first byte of a sequence, 16 entries per row.  The
       table is only ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1995
/* Decode `size` bytes of UTF-8 data starting at `s`.

   Equivalent to PyUnicode_DecodeUTF8Stateful() with consumed == NULL, so
   a truncated sequence at the end of the input is reported through the
   error handler selected by `errors`. */
PyObject *PyUnicode_DecodeUTF8(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
2002
2003PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002005 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002010 Py_ssize_t startinpos;
2011 Py_ssize_t endinpos;
2012 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 const char *e;
2014 PyUnicodeObject *unicode;
2015 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002016 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 PyObject *errorHandler = NULL;
2018 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 /* Note: size will always be longer than the resulting Unicode
2021 character count */
2022 unicode = _PyUnicode_New(size);
2023 if (!unicode)
2024 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002025 if (size == 0) {
2026 if (consumed)
2027 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
2031 /* Unpack UTF-8 encoded data */
2032 p = unicode->str;
2033 e = s + size;
2034
2035 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002036 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002039 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 s++;
2041 continue;
2042 }
2043
2044 n = utf8_code_length[ch];
2045
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002046 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002047 if (consumed)
2048 break;
2049 else {
2050 errmsg = "unexpected end of data";
2051 startinpos = s-starts;
2052 endinpos = size;
2053 goto utf8Error;
2054 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 switch (n) {
2058
2059 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002060 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 startinpos = s-starts;
2062 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002063 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
2065 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002066 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 startinpos = s-starts;
2068 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002069 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070
2071 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002072 if ((s[1] & 0xc0) != 0x80) {
2073 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 startinpos = s-starts;
2075 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002076 goto utf8Error;
2077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002079 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
2081 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002082 errmsg = "illegal encoding";
2083 goto utf8Error;
2084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002086 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 break;
2088
2089 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002090 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002091 (s[2] & 0xc0) != 0x80) {
2092 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 startinpos = s-starts;
2094 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002095 goto utf8Error;
2096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002098 if (ch < 0x0800) {
2099 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002100 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002101
2102 XXX For wide builds (UCS-4) we should probably try
2103 to recombine the surrogates into a single code
2104 unit.
2105 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002106 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 startinpos = s-starts;
2108 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002109 goto utf8Error;
2110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002112 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 break;
2114
2115 case 4:
2116 if ((s[1] & 0xc0) != 0x80 ||
2117 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 (s[3] & 0xc0) != 0x80) {
2119 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = s-starts;
2121 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002122 goto utf8Error;
2123 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002124 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2125 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2126 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002128 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002130 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002132 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 startinpos = s-starts;
2134 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002135 goto utf8Error;
2136 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002137#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002138 *p++ = (Py_UNICODE)ch;
2139#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002140 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002141
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 /* translate from 10000..10FFFF to 0..FFFF */
2143 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002144
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002145 /* high surrogate = top 10 bits added to D800 */
2146 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002147
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002148 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002149 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002150#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 break;
2152
2153 default:
2154 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002155 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 startinpos = s-starts;
2157 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002158 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 }
2160 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002161 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002163 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 outpos = p-PyUnicode_AS_UNICODE(unicode);
2165 if (unicode_decode_call_errorhandler(
2166 errors, &errorHandler,
2167 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002168 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 (PyObject **)&unicode, &outpos, &p))
2170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 }
Walter Dörwald69652032004-09-07 20:24:22 +00002172 if (consumed)
2173 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174
2175 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002176 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 goto onError;
2178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002179 Py_XDECREF(errorHandler);
2180 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 return (PyObject *)unicode;
2182
2183onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 Py_XDECREF(errorHandler);
2185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 Py_DECREF(unicode);
2187 return NULL;
2188}
2189
Tim Peters602f7402002-04-27 18:03:26 +00002190/* Allocation strategy: if the string is short, convert into a stack buffer
2191 and allocate exactly as much space needed at the end. Else allocate the
2192 maximum possible needed (4 result bytes per Unicode character), and return
2193 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002194*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002195PyObject *
2196PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002197 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199{
Tim Peters602f7402002-04-27 18:03:26 +00002200#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002201
Guido van Rossum98297ee2007-11-06 21:34:58 +00002202 Py_ssize_t i; /* index into s of next input byte */
2203 PyObject *result; /* result string object */
2204 char *p; /* next free byte in output buffer */
2205 Py_ssize_t nallocated; /* number of result bytes allocated */
2206 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002207 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002208
Tim Peters602f7402002-04-27 18:03:26 +00002209 assert(s != NULL);
2210 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Tim Peters602f7402002-04-27 18:03:26 +00002212 if (size <= MAX_SHORT_UNICHARS) {
2213 /* Write into the stack buffer; nallocated can't overflow.
2214 * At the end, we'll allocate exactly as much heap space as it
2215 * turns out we need.
2216 */
2217 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002218 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002219 p = stackbuf;
2220 }
2221 else {
2222 /* Overallocate on the heap, and give the excess back at the end. */
2223 nallocated = size * 4;
2224 if (nallocated / 4 != size) /* overflow! */
2225 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002226 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002227 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002228 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002229 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002230 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002231
Tim Peters602f7402002-04-27 18:03:26 +00002232 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002233 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002234
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002235 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002236 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002238
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002240 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002241 *p++ = (char)(0xc0 | (ch >> 6));
2242 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002244 else {
Tim Peters602f7402002-04-27 18:03:26 +00002245 /* Encode UCS2 Unicode ordinals */
2246 if (ch < 0x10000) {
2247 /* Special case: check for high surrogate */
2248 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2249 Py_UCS4 ch2 = s[i];
2250 /* Check for low surrogate and combine the two to
2251 form a UCS4 value */
2252 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002253 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002254 i++;
2255 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002256 }
Tim Peters602f7402002-04-27 18:03:26 +00002257 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002258 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002260 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2261 *p++ = (char)(0x80 | (ch & 0x3f));
2262 continue;
2263 }
2264encodeUCS4:
2265 /* Encode UCS4 Unicode ordinals */
2266 *p++ = (char)(0xf0 | (ch >> 18));
2267 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2268 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2269 *p++ = (char)(0x80 | (ch & 0x3f));
2270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002272
Guido van Rossum98297ee2007-11-06 21:34:58 +00002273 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002274 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002275 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002276 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002277 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002278 }
2279 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002280 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002281 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002282 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002283 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002284 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002285 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002286
Tim Peters602f7402002-04-27 18:03:26 +00002287#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288}
2289
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 if (!PyUnicode_Check(unicode)) {
2293 PyErr_BadArgument();
2294 return NULL;
2295 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002296 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2297 PyUnicode_GET_SIZE(unicode),
2298 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299}
2300
Walter Dörwald41980ca2007-08-16 21:55:45 +00002301/* --- UTF-32 Codec ------------------------------------------------------- */
2302
2303PyObject *
2304PyUnicode_DecodeUTF32(const char *s,
2305 Py_ssize_t size,
2306 const char *errors,
2307 int *byteorder)
2308{
2309 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2310}
2311
2312PyObject *
2313PyUnicode_DecodeUTF32Stateful(const char *s,
2314 Py_ssize_t size,
2315 const char *errors,
2316 int *byteorder,
2317 Py_ssize_t *consumed)
2318{
2319 const char *starts = s;
2320 Py_ssize_t startinpos;
2321 Py_ssize_t endinpos;
2322 Py_ssize_t outpos;
2323 PyUnicodeObject *unicode;
2324 Py_UNICODE *p;
2325#ifndef Py_UNICODE_WIDE
2326 int i, pairs;
2327#else
2328 const int pairs = 0;
2329#endif
2330 const unsigned char *q, *e;
2331 int bo = 0; /* assume native ordering by default */
2332 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002333 /* Offsets from q for retrieving bytes in the right order. */
2334#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2335 int iorder[] = {0, 1, 2, 3};
2336#else
2337 int iorder[] = {3, 2, 1, 0};
2338#endif
2339 PyObject *errorHandler = NULL;
2340 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002341 /* On narrow builds we split characters outside the BMP into two
2342 codepoints => count how much extra space we need. */
2343#ifndef Py_UNICODE_WIDE
2344 for (i = pairs = 0; i < size/4; i++)
2345 if (((Py_UCS4 *)s)[i] >= 0x10000)
2346 pairs++;
2347#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002348
2349 /* This might be one to much, because of a BOM */
2350 unicode = _PyUnicode_New((size+3)/4+pairs);
2351 if (!unicode)
2352 return NULL;
2353 if (size == 0)
2354 return (PyObject *)unicode;
2355
2356 /* Unpack UTF-32 encoded data */
2357 p = unicode->str;
2358 q = (unsigned char *)s;
2359 e = q + size;
2360
2361 if (byteorder)
2362 bo = *byteorder;
2363
2364 /* Check for BOM marks (U+FEFF) in the input and adjust current
2365 byte order setting accordingly. In native mode, the leading BOM
2366 mark is skipped, in all other modes, it is copied to the output
2367 stream as-is (giving a ZWNBSP character). */
2368 if (bo == 0) {
2369 if (size >= 4) {
2370 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2371 (q[iorder[1]] << 8) | q[iorder[0]];
2372#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2373 if (bom == 0x0000FEFF) {
2374 q += 4;
2375 bo = -1;
2376 }
2377 else if (bom == 0xFFFE0000) {
2378 q += 4;
2379 bo = 1;
2380 }
2381#else
2382 if (bom == 0x0000FEFF) {
2383 q += 4;
2384 bo = 1;
2385 }
2386 else if (bom == 0xFFFE0000) {
2387 q += 4;
2388 bo = -1;
2389 }
2390#endif
2391 }
2392 }
2393
2394 if (bo == -1) {
2395 /* force LE */
2396 iorder[0] = 0;
2397 iorder[1] = 1;
2398 iorder[2] = 2;
2399 iorder[3] = 3;
2400 }
2401 else if (bo == 1) {
2402 /* force BE */
2403 iorder[0] = 3;
2404 iorder[1] = 2;
2405 iorder[2] = 1;
2406 iorder[3] = 0;
2407 }
2408
2409 while (q < e) {
2410 Py_UCS4 ch;
2411 /* remaining bytes at the end? (size should be divisible by 4) */
2412 if (e-q<4) {
2413 if (consumed)
2414 break;
2415 errmsg = "truncated data";
2416 startinpos = ((const char *)q)-starts;
2417 endinpos = ((const char *)e)-starts;
2418 goto utf32Error;
2419 /* The remaining input chars are ignored if the callback
2420 chooses to skip the input */
2421 }
2422 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2423 (q[iorder[1]] << 8) | q[iorder[0]];
2424
2425 if (ch >= 0x110000)
2426 {
2427 errmsg = "codepoint not in range(0x110000)";
2428 startinpos = ((const char *)q)-starts;
2429 endinpos = startinpos+4;
2430 goto utf32Error;
2431 }
2432#ifndef Py_UNICODE_WIDE
2433 if (ch >= 0x10000)
2434 {
2435 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2436 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2437 }
2438 else
2439#endif
2440 *p++ = ch;
2441 q += 4;
2442 continue;
2443 utf32Error:
2444 outpos = p-PyUnicode_AS_UNICODE(unicode);
2445 if (unicode_decode_call_errorhandler(
2446 errors, &errorHandler,
2447 "utf32", errmsg,
2448 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2449 (PyObject **)&unicode, &outpos, &p))
2450 goto onError;
2451 }
2452
2453 if (byteorder)
2454 *byteorder = bo;
2455
2456 if (consumed)
2457 *consumed = (const char *)q-starts;
2458
2459 /* Adjust length */
2460 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2461 goto onError;
2462
2463 Py_XDECREF(errorHandler);
2464 Py_XDECREF(exc);
2465 return (PyObject *)unicode;
2466
2467onError:
2468 Py_DECREF(unicode);
2469 Py_XDECREF(errorHandler);
2470 Py_XDECREF(exc);
2471 return NULL;
2472}
2473
2474PyObject *
2475PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2476 Py_ssize_t size,
2477 const char *errors,
2478 int byteorder)
2479{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002480 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002481 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002482 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002484 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485#else
2486 const int pairs = 0;
2487#endif
2488 /* Offsets from p for storing byte pairs in the right order. */
2489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2490 int iorder[] = {0, 1, 2, 3};
2491#else
2492 int iorder[] = {3, 2, 1, 0};
2493#endif
2494
2495#define STORECHAR(CH) \
2496 do { \
2497 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2498 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2499 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2500 p[iorder[0]] = (CH) & 0xff; \
2501 p += 4; \
2502 } while(0)
2503
2504 /* In narrow builds we can output surrogate pairs as one codepoint,
2505 so we need less space. */
2506#ifndef Py_UNICODE_WIDE
2507 for (i = pairs = 0; i < size-1; i++)
2508 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2509 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2510 pairs++;
2511#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002512 nsize = (size - pairs + (byteorder == 0));
2513 bytesize = nsize * 4;
2514 if (bytesize / 4 != nsize)
2515 return PyErr_NoMemory();
2516 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002517 if (v == NULL)
2518 return NULL;
2519
Christian Heimes9c4756e2008-05-26 13:22:05 +00002520 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002521 if (byteorder == 0)
2522 STORECHAR(0xFEFF);
2523 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002524 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002525
2526 if (byteorder == -1) {
2527 /* force LE */
2528 iorder[0] = 0;
2529 iorder[1] = 1;
2530 iorder[2] = 2;
2531 iorder[3] = 3;
2532 }
2533 else if (byteorder == 1) {
2534 /* force BE */
2535 iorder[0] = 3;
2536 iorder[1] = 2;
2537 iorder[2] = 1;
2538 iorder[3] = 0;
2539 }
2540
2541 while (size-- > 0) {
2542 Py_UCS4 ch = *s++;
2543#ifndef Py_UNICODE_WIDE
2544 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2545 Py_UCS4 ch2 = *s;
2546 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2547 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2548 s++;
2549 size--;
2550 }
2551 }
2552#endif
2553 STORECHAR(ch);
2554 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002555
2556 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002557 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 Py_DECREF(v);
2559 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002560#undef STORECHAR
2561}
2562
2563PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2564{
2565 if (!PyUnicode_Check(unicode)) {
2566 PyErr_BadArgument();
2567 return NULL;
2568 }
2569 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2570 PyUnicode_GET_SIZE(unicode),
2571 NULL,
2572 0);
2573}
2574
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575/* --- UTF-16 Codec ------------------------------------------------------- */
2576
Tim Peters772747b2001-08-09 22:21:55 +00002577PyObject *
2578PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002580 const char *errors,
2581 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582{
Walter Dörwald69652032004-09-07 20:24:22 +00002583 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2584}
2585
2586PyObject *
2587PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002588 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002589 const char *errors,
2590 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002591 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002594 Py_ssize_t startinpos;
2595 Py_ssize_t endinpos;
2596 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 PyUnicodeObject *unicode;
2598 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002599 const unsigned char *q, *e;
2600 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002601 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002602 /* Offsets from q for retrieving byte pairs in the right order. */
2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2604 int ihi = 1, ilo = 0;
2605#else
2606 int ihi = 0, ilo = 1;
2607#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 PyObject *errorHandler = NULL;
2609 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 /* Note: size will always be longer than the resulting Unicode
2612 character count */
2613 unicode = _PyUnicode_New(size);
2614 if (!unicode)
2615 return NULL;
2616 if (size == 0)
2617 return (PyObject *)unicode;
2618
2619 /* Unpack UTF-16 encoded data */
2620 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002621 q = (unsigned char *)s;
2622 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623
2624 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002625 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002627 /* Check for BOM marks (U+FEFF) in the input and adjust current
2628 byte order setting accordingly. In native mode, the leading BOM
2629 mark is skipped, in all other modes, it is copied to the output
2630 stream as-is (giving a ZWNBSP character). */
2631 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002632 if (size >= 2) {
2633 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002634#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002635 if (bom == 0xFEFF) {
2636 q += 2;
2637 bo = -1;
2638 }
2639 else if (bom == 0xFFFE) {
2640 q += 2;
2641 bo = 1;
2642 }
Tim Petersced69f82003-09-16 20:30:58 +00002643#else
Walter Dörwald69652032004-09-07 20:24:22 +00002644 if (bom == 0xFEFF) {
2645 q += 2;
2646 bo = 1;
2647 }
2648 else if (bom == 0xFFFE) {
2649 q += 2;
2650 bo = -1;
2651 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002652#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002653 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
Tim Peters772747b2001-08-09 22:21:55 +00002656 if (bo == -1) {
2657 /* force LE */
2658 ihi = 1;
2659 ilo = 0;
2660 }
2661 else if (bo == 1) {
2662 /* force BE */
2663 ihi = 0;
2664 ilo = 1;
2665 }
2666
2667 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002669 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002671 if (consumed)
2672 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 errmsg = "truncated data";
2674 startinpos = ((const char *)q)-starts;
2675 endinpos = ((const char *)e)-starts;
2676 goto utf16Error;
2677 /* The remaining input chars are ignored if the callback
2678 chooses to skip the input */
2679 }
2680 ch = (q[ihi] << 8) | q[ilo];
2681
Tim Peters772747b2001-08-09 22:21:55 +00002682 q += 2;
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (ch < 0xD800 || ch > 0xDFFF) {
2685 *p++ = ch;
2686 continue;
2687 }
2688
2689 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002690 if (q >= e) {
2691 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 startinpos = (((const char *)q)-2)-starts;
2693 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002694 goto utf16Error;
2695 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002696 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002697 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2698 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002699 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002700#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002701 *p++ = ch;
2702 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002703#else
2704 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002706 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002707 }
2708 else {
2709 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 startinpos = (((const char *)q)-4)-starts;
2711 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002712 goto utf16Error;
2713 }
2714
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002716 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 startinpos = (((const char *)q)-2)-starts;
2718 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002719 /* Fall through to report the error */
2720
2721 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 outpos = p-PyUnicode_AS_UNICODE(unicode);
2723 if (unicode_decode_call_errorhandler(
2724 errors, &errorHandler,
2725 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002726 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 }
2730
2731 if (byteorder)
2732 *byteorder = bo;
2733
Walter Dörwald69652032004-09-07 20:24:22 +00002734 if (consumed)
2735 *consumed = (const char *)q-starts;
2736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002738 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 goto onError;
2740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 Py_XDECREF(errorHandler);
2742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 return (PyObject *)unicode;
2744
2745onError:
2746 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 return NULL;
2750}
2751
Tim Peters772747b2001-08-09 22:21:55 +00002752PyObject *
2753PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002754 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002755 const char *errors,
2756 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002758 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002759 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002760 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002761#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002762 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002763#else
2764 const int pairs = 0;
2765#endif
Tim Peters772747b2001-08-09 22:21:55 +00002766 /* Offsets from p for storing byte pairs in the right order. */
2767#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2768 int ihi = 1, ilo = 0;
2769#else
2770 int ihi = 0, ilo = 1;
2771#endif
2772
2773#define STORECHAR(CH) \
2774 do { \
2775 p[ihi] = ((CH) >> 8) & 0xff; \
2776 p[ilo] = (CH) & 0xff; \
2777 p += 2; \
2778 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002781 for (i = pairs = 0; i < size; i++)
2782 if (s[i] >= 0x10000)
2783 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002784#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002785 /* 2 * (size + pairs + (byteorder == 0)) */
2786 if (size > PY_SSIZE_T_MAX ||
2787 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2788 return PyErr_NoMemory();
2789 nsize = size + pairs + (byteorder == 0);
2790 bytesize = nsize * 2;
2791 if (bytesize / 2 != nsize)
2792 return PyErr_NoMemory();
2793 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 if (v == NULL)
2795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796
Christian Heimes9c4756e2008-05-26 13:22:05 +00002797 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002799 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002800 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002801 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002802
2803 if (byteorder == -1) {
2804 /* force LE */
2805 ihi = 1;
2806 ilo = 0;
2807 }
2808 else if (byteorder == 1) {
2809 /* force BE */
2810 ihi = 0;
2811 ilo = 1;
2812 }
2813
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002814 while (size-- > 0) {
2815 Py_UNICODE ch = *s++;
2816 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002817#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002818 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002819 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2820 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002822#endif
Tim Peters772747b2001-08-09 22:21:55 +00002823 STORECHAR(ch);
2824 if (ch2)
2825 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002826 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002827
2828 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002829 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002830 Py_DECREF(v);
2831 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002832#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833}
2834
2835PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2836{
2837 if (!PyUnicode_Check(unicode)) {
2838 PyErr_BadArgument();
2839 return NULL;
2840 }
2841 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2842 PyUnicode_GET_SIZE(unicode),
2843 NULL,
2844 0);
2845}
2846
2847/* --- Unicode Escape Codec ----------------------------------------------- */
2848
/* Cached pointer to the C API exported by the unicodedata module as
   "ucnhash_CAPI".  It stays NULL until PyUnicode_DecodeUnicodeEscape()
   first meets a \N escape and loads it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002849static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 const char *errors)
2854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002856 Py_ssize_t startinpos;
2857 Py_ssize_t endinpos;
2858 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002863 char* message;
2864 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 PyObject *errorHandler = NULL;
2866 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002867
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 /* Escaped strings will always be longer than the resulting
2869 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 length after conversion to the true value.
2871 (but if the error callback returns a long replacement string
2872 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 v = _PyUnicode_New(size);
2874 if (v == NULL)
2875 goto onError;
2876 if (size == 0)
2877 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 while (s < end) {
2883 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002884 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
2887 /* Non-escape characters are interpreted as Unicode ordinals */
2888 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 continue;
2891 }
2892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 /* \ - Escapes */
2895 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002896 c = *s++;
2897 if (s > end)
2898 c = '\0'; /* Invalid after \ */
2899 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
2901 /* \x escapes */
2902 case '\n': break;
2903 case '\\': *p++ = '\\'; break;
2904 case '\'': *p++ = '\''; break;
2905 case '\"': *p++ = '\"'; break;
2906 case 'b': *p++ = '\b'; break;
2907 case 'f': *p++ = '\014'; break; /* FF */
2908 case 't': *p++ = '\t'; break;
2909 case 'n': *p++ = '\n'; break;
2910 case 'r': *p++ = '\r'; break;
2911 case 'v': *p++ = '\013'; break; /* VT */
2912 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2913
2914 /* \OOO (octal) escapes */
2915 case '0': case '1': case '2': case '3':
2916 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002917 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002918 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002919 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002920 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002921 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002923 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 break;
2925
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 /* hex escapes */
2927 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002929 digits = 2;
2930 message = "truncated \\xXX escape";
2931 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Fredrik Lundhccc74732001-02-18 22:13:49 +00002933 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 digits = 4;
2936 message = "truncated \\uXXXX escape";
2937 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938
Fredrik Lundhccc74732001-02-18 22:13:49 +00002939 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002940 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002941 digits = 8;
2942 message = "truncated \\UXXXXXXXX escape";
2943 hexescape:
2944 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 outpos = p-PyUnicode_AS_UNICODE(v);
2946 if (s+digits>end) {
2947 endinpos = size;
2948 if (unicode_decode_call_errorhandler(
2949 errors, &errorHandler,
2950 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002951 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 (PyObject **)&v, &outpos, &p))
2953 goto onError;
2954 goto nextByte;
2955 }
2956 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002957 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002958 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 endinpos = (s+i+1)-starts;
2960 if (unicode_decode_call_errorhandler(
2961 errors, &errorHandler,
2962 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002967 }
2968 chr = (chr<<4) & ~0xF;
2969 if (c >= '0' && c <= '9')
2970 chr += c - '0';
2971 else if (c >= 'a' && c <= 'f')
2972 chr += 10 + c - 'a';
2973 else
2974 chr += 10 + c - 'A';
2975 }
2976 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002977 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 /* _decoding_error will have already written into the
2979 target buffer. */
2980 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002981 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002982 /* when we get here, chr is a 32-bit unicode character */
2983 if (chr <= 0xffff)
2984 /* UCS-2 character */
2985 *p++ = (Py_UNICODE) chr;
2986 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002987 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002988 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002989#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002990 *p++ = chr;
2991#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002992 chr -= 0x10000L;
2993 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002994 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002995#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002996 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 endinpos = s-starts;
2998 outpos = p-PyUnicode_AS_UNICODE(v);
2999 if (unicode_decode_call_errorhandler(
3000 errors, &errorHandler,
3001 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003002 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003004 goto onError;
3005 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003006 break;
3007
3008 /* \N{name} */
3009 case 'N':
3010 message = "malformed \\N character escape";
3011 if (ucnhash_CAPI == NULL) {
3012 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003013 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003014 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003015 if (m == NULL)
3016 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003017 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003018 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003019 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003020 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003021 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003022 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003023 if (ucnhash_CAPI == NULL)
3024 goto ucnhashError;
3025 }
3026 if (*s == '{') {
3027 const char *start = s+1;
3028 /* look for the closing brace */
3029 while (*s != '}' && s < end)
3030 s++;
3031 if (s > start && s < end && *s == '}') {
3032 /* found a name. look it up in the unicode database */
3033 message = "unknown Unicode character name";
3034 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003035 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003036 goto store;
3037 }
3038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 endinpos = s-starts;
3040 outpos = p-PyUnicode_AS_UNICODE(v);
3041 if (unicode_decode_call_errorhandler(
3042 errors, &errorHandler,
3043 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003044 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003046 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003047 break;
3048
3049 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003050 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 message = "\\ at end of string";
3052 s--;
3053 endinpos = s-starts;
3054 outpos = p-PyUnicode_AS_UNICODE(v);
3055 if (unicode_decode_call_errorhandler(
3056 errors, &errorHandler,
3057 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003058 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003060 goto onError;
3061 }
3062 else {
3063 *p++ = '\\';
3064 *p++ = (unsigned char)s[-1];
3065 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003066 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 nextByte:
3069 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003071 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003073 Py_XDECREF(errorHandler);
3074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003076
Fredrik Lundhccc74732001-02-18 22:13:49 +00003077ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003078 PyErr_SetString(
3079 PyExc_UnicodeError,
3080 "\\N escapes not supported (can't load unicodedata module)"
3081 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003082 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 Py_XDECREF(errorHandler);
3084 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003085 return NULL;
3086
Fredrik Lundhccc74732001-02-18 22:13:49 +00003087onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 Py_XDECREF(errorHandler);
3090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 return NULL;
3092}
3093
3094/* Return a Unicode-Escape string version of the Unicode object.
3095
3096 If quotes is true, the string is enclosed in u"" or u'' quotes as
3097 appropriate.
3098
3099*/
3100
Thomas Wouters477c8d52006-05-27 19:21:47 +00003101Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3102 Py_ssize_t size,
3103 Py_UNICODE ch)
3104{
3105 /* like wcschr, but doesn't stop at NULL characters */
3106
3107 while (size-- > 0) {
3108 if (*s == ch)
3109 return s;
3110 s++;
3111 }
3112
3113 return NULL;
3114}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003115
/* Lowercase hexadecimal digits indexed by nibble value (0..15).  The
   escape encoders below use it to format \xhh, \uxxxx and \Uxxxxxxxx.
   Both the characters and the pointer itself are read-only. */
static const char * const hexdigits = "0123456789abcdef";
3117
3118PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3119 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003121 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003124#ifdef Py_UNICODE_WIDE
3125 const Py_ssize_t expandsize = 10;
3126#else
3127 const Py_ssize_t expandsize = 6;
3128#endif
3129
Thomas Wouters89f507f2006-12-13 04:49:30 +00003130 /* XXX(nnorwitz): rather than over-allocating, it would be
3131 better to choose a different scheme. Perhaps scan the
3132 first N-chars of the string and allocate based on that size.
3133 */
3134 /* Initial allocation is based on the longest-possible unichr
3135 escape.
3136
3137 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3138 unichr, so in this case it's the longest unichr escape. In
3139 narrow (UTF-16) builds this is five chars per source unichr
3140 since there are two unichrs in the surrogate pair, so in narrow
3141 (UTF-16) builds it's not the longest unichr escape.
3142
3143 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3144 so in the narrow (UTF-16) build case it's the longest unichr
3145 escape.
3146 */
3147
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003148 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3149 return PyErr_NoMemory();
3150
Christian Heimes9c4756e2008-05-26 13:22:05 +00003151 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003152 2
3153 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003154 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (repr == NULL)
3156 return NULL;
3157
Christian Heimes9c4756e2008-05-26 13:22:05 +00003158 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 while (size-- > 0) {
3161 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003162
Walter Dörwald79e913e2007-05-12 11:08:06 +00003163 /* Escape backslashes */
3164 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 *p++ = '\\';
3166 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003167 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003168 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003169
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003170#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003171 /* Map 21-bit characters to '\U00xxxxxx' */
3172 else if (ch >= 0x10000) {
3173 *p++ = '\\';
3174 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003175 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3176 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3177 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3178 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3179 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3180 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3181 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3182 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003183 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003184 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003185#else
3186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003187 else if (ch >= 0xD800 && ch < 0xDC00) {
3188 Py_UNICODE ch2;
3189 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003190
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003191 ch2 = *s++;
3192 size--;
3193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3195 *p++ = '\\';
3196 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003197 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3198 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3199 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3200 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3201 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3202 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3203 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3204 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003205 continue;
3206 }
3207 /* Fall through: isolated surrogates are copied as-is */
3208 s--;
3209 size++;
3210 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003211#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003214 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 *p++ = '\\';
3216 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003217 *p++ = hexdigits[(ch >> 12) & 0x000F];
3218 *p++ = hexdigits[(ch >> 8) & 0x000F];
3219 *p++ = hexdigits[(ch >> 4) & 0x000F];
3220 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003222
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003223 /* Map special whitespace to '\t', \n', '\r' */
3224 else if (ch == '\t') {
3225 *p++ = '\\';
3226 *p++ = 't';
3227 }
3228 else if (ch == '\n') {
3229 *p++ = '\\';
3230 *p++ = 'n';
3231 }
3232 else if (ch == '\r') {
3233 *p++ = '\\';
3234 *p++ = 'r';
3235 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003236
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003237 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003238 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003240 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003241 *p++ = hexdigits[(ch >> 4) & 0x000F];
3242 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 /* Copy everything else as-is */
3246 else
3247 *p++ = (char) ch;
3248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249
Christian Heimes72b710a2008-05-26 13:28:38 +00003250 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003251 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003252 Py_DECREF(repr);
3253 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254}
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3257{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003258 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 if (!PyUnicode_Check(unicode)) {
3260 PyErr_BadArgument();
3261 return NULL;
3262 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003263 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3264 PyUnicode_GET_SIZE(unicode));
3265
3266 if (!s)
3267 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003268 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003269 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003270 Py_DECREF(s);
3271 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272}
3273
3274/* --- Raw Unicode Escape Codec ------------------------------------------- */
3275
3276PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 const char *errors)
3279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t startinpos;
3282 Py_ssize_t endinpos;
3283 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 const char *end;
3287 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 PyObject *errorHandler = NULL;
3289 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003290
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 /* Escaped strings will always be longer than the resulting
3292 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 length after conversion to the true value. (But decoding error
3294 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 v = _PyUnicode_New(size);
3296 if (v == NULL)
3297 goto onError;
3298 if (size == 0)
3299 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 end = s + size;
3302 while (s < end) {
3303 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003304 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003306 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307
3308 /* Non-escape characters are interpreted as Unicode ordinals */
3309 if (*s != '\\') {
3310 *p++ = (unsigned char)*s++;
3311 continue;
3312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314
3315 /* \u-escapes are only interpreted iff the number of leading
3316 backslashes if odd */
3317 bs = s;
3318 for (;s < end;) {
3319 if (*s != '\\')
3320 break;
3321 *p++ = (unsigned char)*s++;
3322 }
3323 if (((s - bs) & 1) == 0 ||
3324 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 continue;
3327 }
3328 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003329 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 s++;
3331
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003332 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003336 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 endinpos = s-starts;
3338 if (unicode_decode_call_errorhandler(
3339 errors, &errorHandler,
3340 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003341 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 }
3346 x = (x<<4) & ~0xF;
3347 if (c >= '0' && c <= '9')
3348 x += c - '0';
3349 else if (c >= 'a' && c <= 'f')
3350 x += 10 + c - 'a';
3351 else
3352 x += 10 + c - 'A';
3353 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003354 if (x <= 0xffff)
3355 /* UCS-2 character */
3356 *p++ = (Py_UNICODE) x;
3357 else if (x <= 0x10ffff) {
3358 /* UCS-4 character. Either store directly, or as
3359 surrogate pair. */
3360#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003361 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003362#else
3363 x -= 0x10000L;
3364 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3365 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3366#endif
3367 } else {
3368 endinpos = s-starts;
3369 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003370 if (unicode_decode_call_errorhandler(
3371 errors, &errorHandler,
3372 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003373 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003374 (PyObject **)&v, &outpos, &p))
3375 goto onError;
3376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 nextByte:
3378 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003380 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003381 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 Py_XDECREF(errorHandler);
3383 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003385
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 onError:
3387 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 Py_XDECREF(errorHandler);
3389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 return NULL;
3391}
3392
3393PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003396 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 char *p;
3398 char *q;
3399
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003400#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003401 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003402#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003403 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003404#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003405
3406 if (size > PY_SSIZE_T_MAX / expandsize)
3407 return PyErr_NoMemory();
3408
3409 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 if (repr == NULL)
3411 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003412 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003413 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414
Christian Heimes9c4756e2008-05-26 13:22:05 +00003415 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 while (size-- > 0) {
3417 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003418#ifdef Py_UNICODE_WIDE
3419 /* Map 32-bit characters to '\Uxxxxxxxx' */
3420 if (ch >= 0x10000) {
3421 *p++ = '\\';
3422 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003423 *p++ = hexdigits[(ch >> 28) & 0xf];
3424 *p++ = hexdigits[(ch >> 24) & 0xf];
3425 *p++ = hexdigits[(ch >> 20) & 0xf];
3426 *p++ = hexdigits[(ch >> 16) & 0xf];
3427 *p++ = hexdigits[(ch >> 12) & 0xf];
3428 *p++ = hexdigits[(ch >> 8) & 0xf];
3429 *p++ = hexdigits[(ch >> 4) & 0xf];
3430 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003431 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003432 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003433#else
3434 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3435 if (ch >= 0xD800 && ch < 0xDC00) {
3436 Py_UNICODE ch2;
3437 Py_UCS4 ucs;
3438
3439 ch2 = *s++;
3440 size--;
3441 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3442 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3443 *p++ = '\\';
3444 *p++ = 'U';
3445 *p++ = hexdigits[(ucs >> 28) & 0xf];
3446 *p++ = hexdigits[(ucs >> 24) & 0xf];
3447 *p++ = hexdigits[(ucs >> 20) & 0xf];
3448 *p++ = hexdigits[(ucs >> 16) & 0xf];
3449 *p++ = hexdigits[(ucs >> 12) & 0xf];
3450 *p++ = hexdigits[(ucs >> 8) & 0xf];
3451 *p++ = hexdigits[(ucs >> 4) & 0xf];
3452 *p++ = hexdigits[ucs & 0xf];
3453 continue;
3454 }
3455 /* Fall through: isolated surrogates are copied as-is */
3456 s--;
3457 size++;
3458 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003459#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 /* Map 16-bit characters to '\uxxxx' */
3461 if (ch >= 256) {
3462 *p++ = '\\';
3463 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003464 *p++ = hexdigits[(ch >> 12) & 0xf];
3465 *p++ = hexdigits[(ch >> 8) & 0xf];
3466 *p++ = hexdigits[(ch >> 4) & 0xf];
3467 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
3469 /* Copy everything else as-is */
3470 else
3471 *p++ = (char) ch;
3472 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003473 size = p - q;
3474
3475 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003476 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003477 Py_DECREF(repr);
3478 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479}
3480
3481PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3482{
Walter Dörwald711005d2007-05-12 12:03:26 +00003483 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003485 PyErr_BadArgument();
3486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003488 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3489 PyUnicode_GET_SIZE(unicode));
3490
3491 if (!s)
3492 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003493 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003494 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003495 Py_DECREF(s);
3496 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497}
3498
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003499/* --- Unicode Internal Codec ------------------------------------------- */
3500
3501PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003503 const char *errors)
3504{
3505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t startinpos;
3507 Py_ssize_t endinpos;
3508 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003509 PyUnicodeObject *v;
3510 Py_UNICODE *p;
3511 const char *end;
3512 const char *reason;
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515
Neal Norwitzd43069c2006-01-08 01:12:10 +00003516#ifdef Py_UNICODE_WIDE
3517 Py_UNICODE unimax = PyUnicode_GetMax();
3518#endif
3519
Thomas Wouters89f507f2006-12-13 04:49:30 +00003520 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003521 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3522 if (v == NULL)
3523 goto onError;
3524 if (PyUnicode_GetSize((PyObject *)v) == 0)
3525 return (PyObject *)v;
3526 p = PyUnicode_AS_UNICODE(v);
3527 end = s + size;
3528
3529 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003530 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003531 /* We have to sanity check the raw data, otherwise doom looms for
3532 some malformed UCS-4 data. */
3533 if (
3534 #ifdef Py_UNICODE_WIDE
3535 *p > unimax || *p < 0 ||
3536 #endif
3537 end-s < Py_UNICODE_SIZE
3538 )
3539 {
3540 startinpos = s - starts;
3541 if (end-s < Py_UNICODE_SIZE) {
3542 endinpos = end-starts;
3543 reason = "truncated input";
3544 }
3545 else {
3546 endinpos = s - starts + Py_UNICODE_SIZE;
3547 reason = "illegal code point (> 0x10FFFF)";
3548 }
3549 outpos = p - PyUnicode_AS_UNICODE(v);
3550 if (unicode_decode_call_errorhandler(
3551 errors, &errorHandler,
3552 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003553 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003554 (PyObject **)&v, &outpos, &p)) {
3555 goto onError;
3556 }
3557 }
3558 else {
3559 p++;
3560 s += Py_UNICODE_SIZE;
3561 }
3562 }
3563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003564 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003565 goto onError;
3566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
3568 return (PyObject *)v;
3569
3570 onError:
3571 Py_XDECREF(v);
3572 Py_XDECREF(errorHandler);
3573 Py_XDECREF(exc);
3574 return NULL;
3575}
3576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577/* --- Latin-1 Codec ------------------------------------------------------ */
3578
3579PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 const char *errors)
3582{
3583 PyUnicodeObject *v;
3584 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003587 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003588 Py_UNICODE r = *(unsigned char*)s;
3589 return PyUnicode_FromUnicode(&r, 1);
3590 }
3591
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 v = _PyUnicode_New(size);
3593 if (v == NULL)
3594 goto onError;
3595 if (size == 0)
3596 return (PyObject *)v;
3597 p = PyUnicode_AS_UNICODE(v);
3598 while (size-- > 0)
3599 *p++ = (unsigned char)*s++;
3600 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003601
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 onError:
3603 Py_XDECREF(v);
3604 return NULL;
3605}
3606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607/* create or adjust a UnicodeEncodeError */
3608static void make_encode_exception(PyObject **exceptionObject,
3609 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 const Py_UNICODE *unicode, Py_ssize_t size,
3611 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 if (*exceptionObject == NULL) {
3615 *exceptionObject = PyUnicodeEncodeError_Create(
3616 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 }
3618 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3620 goto onError;
3621 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3622 goto onError;
3623 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3624 goto onError;
3625 return;
3626 onError:
3627 Py_DECREF(*exceptionObject);
3628 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 }
3630}
3631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632/* raises a UnicodeEncodeError */
3633static void raise_encode_exception(PyObject **exceptionObject,
3634 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003635 const Py_UNICODE *unicode, Py_ssize_t size,
3636 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 const char *reason)
3638{
3639 make_encode_exception(exceptionObject,
3640 encoding, unicode, size, startpos, endpos, reason);
3641 if (*exceptionObject != NULL)
3642 PyCodec_StrictErrors(*exceptionObject);
3643}
3644
3645/* error handling callback helper:
3646 build arguments, call the callback and check the arguments,
3647 put the result into newpos and return the replacement string, which
3648 has to be freed by the caller */
3649static PyObject *unicode_encode_call_errorhandler(const char *errors,
3650 PyObject **errorHandler,
3651 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3653 Py_ssize_t startpos, Py_ssize_t endpos,
3654 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003656 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657
3658 PyObject *restuple;
3659 PyObject *resunicode;
3660
3661 if (*errorHandler == NULL) {
3662 *errorHandler = PyCodec_LookupError(errors);
3663 if (*errorHandler == NULL)
3664 return NULL;
3665 }
3666
3667 make_encode_exception(exceptionObject,
3668 encoding, unicode, size, startpos, endpos, reason);
3669 if (*exceptionObject == NULL)
3670 return NULL;
3671
3672 restuple = PyObject_CallFunctionObjArgs(
3673 *errorHandler, *exceptionObject, NULL);
3674 if (restuple == NULL)
3675 return NULL;
3676 if (!PyTuple_Check(restuple)) {
3677 PyErr_Format(PyExc_TypeError, &argparse[4]);
3678 Py_DECREF(restuple);
3679 return NULL;
3680 }
3681 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3682 &resunicode, newpos)) {
3683 Py_DECREF(restuple);
3684 return NULL;
3685 }
3686 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003687 *newpos = size+*newpos;
3688 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003689 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003690 Py_DECREF(restuple);
3691 return NULL;
3692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 Py_INCREF(resunicode);
3694 Py_DECREF(restuple);
3695 return resunicode;
3696}
3697
3698static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003699 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 const char *errors,
3701 int limit)
3702{
3703 /* output object */
3704 PyObject *res;
3705 /* pointers to the beginning and end+1 of input */
3706 const Py_UNICODE *startp = p;
3707 const Py_UNICODE *endp = p + size;
3708 /* pointer to the beginning of the unencodable characters */
3709 /* const Py_UNICODE *badp = NULL; */
3710 /* pointer into the output */
3711 char *str;
3712 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003713 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003714 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3715 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 PyObject *errorHandler = NULL;
3717 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003718 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* the following variable is used for caching string comparisons
3720 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3721 int known_errorHandler = -1;
3722
3723 /* allocate enough for a simple encoding without
3724 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003725 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003726 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003727 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003729 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003730 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 ressize = size;
3732
3733 while (p<endp) {
3734 Py_UNICODE c = *p;
3735
3736 /* can we encode this? */
3737 if (c<limit) {
3738 /* no overflow check, because we know that the space is enough */
3739 *str++ = (char)c;
3740 ++p;
3741 }
3742 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003743 Py_ssize_t unicodepos = p-startp;
3744 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003746 Py_ssize_t repsize;
3747 Py_ssize_t newpos;
3748 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 Py_UNICODE *uni2;
3750 /* startpos for collecting unencodable chars */
3751 const Py_UNICODE *collstart = p;
3752 const Py_UNICODE *collend = p;
3753 /* find all unecodable characters */
3754 while ((collend < endp) && ((*collend)>=limit))
3755 ++collend;
3756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3757 if (known_errorHandler==-1) {
3758 if ((errors==NULL) || (!strcmp(errors, "strict")))
3759 known_errorHandler = 1;
3760 else if (!strcmp(errors, "replace"))
3761 known_errorHandler = 2;
3762 else if (!strcmp(errors, "ignore"))
3763 known_errorHandler = 3;
3764 else if (!strcmp(errors, "xmlcharrefreplace"))
3765 known_errorHandler = 4;
3766 else
3767 known_errorHandler = 0;
3768 }
3769 switch (known_errorHandler) {
3770 case 1: /* strict */
3771 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3772 goto onError;
3773 case 2: /* replace */
3774 while (collstart++<collend)
3775 *str++ = '?'; /* fall through */
3776 case 3: /* ignore */
3777 p = collend;
3778 break;
3779 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003780 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 /* determine replacement size (temporarily (mis)uses p) */
3782 for (p = collstart, repsize = 0; p < collend; ++p) {
3783 if (*p<10)
3784 repsize += 2+1+1;
3785 else if (*p<100)
3786 repsize += 2+2+1;
3787 else if (*p<1000)
3788 repsize += 2+3+1;
3789 else if (*p<10000)
3790 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003791#ifndef Py_UNICODE_WIDE
3792 else
3793 repsize += 2+5+1;
3794#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 else if (*p<100000)
3796 repsize += 2+5+1;
3797 else if (*p<1000000)
3798 repsize += 2+6+1;
3799 else
3800 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003801#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 }
3803 requiredsize = respos+repsize+(endp-collend);
3804 if (requiredsize > ressize) {
3805 if (requiredsize<2*ressize)
3806 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003807 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003809 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 ressize = requiredsize;
3811 }
3812 /* generate replacement (temporarily (mis)uses p) */
3813 for (p = collstart; p < collend; ++p) {
3814 str += sprintf(str, "&#%d;", (int)*p);
3815 }
3816 p = collend;
3817 break;
3818 default:
3819 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3820 encoding, reason, startp, size, &exc,
3821 collstart-startp, collend-startp, &newpos);
3822 if (repunicode == NULL)
3823 goto onError;
3824 /* need more space? (at least enough for what we
3825 have+the replacement+the rest of the string, so
3826 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003827 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 repsize = PyUnicode_GET_SIZE(repunicode);
3829 requiredsize = respos+repsize+(endp-collend);
3830 if (requiredsize > ressize) {
3831 if (requiredsize<2*ressize)
3832 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003833 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_DECREF(repunicode);
3835 goto onError;
3836 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003837 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 ressize = requiredsize;
3839 }
3840 /* check if there is anything unencodable in the replacement
3841 and copy it to the output */
3842 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3843 c = *uni2;
3844 if (c >= limit) {
3845 raise_encode_exception(&exc, encoding, startp, size,
3846 unicodepos, unicodepos+1, reason);
3847 Py_DECREF(repunicode);
3848 goto onError;
3849 }
3850 *str = (char)c;
3851 }
3852 p = startp + newpos;
3853 Py_DECREF(repunicode);
3854 }
3855 }
3856 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003857 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003858 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003859 onError:
3860 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 Py_XDECREF(errorHandler);
3862 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003863 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864}
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 const char *errors)
3869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871}
3872
3873PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3874{
3875 if (!PyUnicode_Check(unicode)) {
3876 PyErr_BadArgument();
3877 return NULL;
3878 }
3879 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3880 PyUnicode_GET_SIZE(unicode),
3881 NULL);
3882}
3883
3884/* --- 7-bit ASCII Codec -------------------------------------------------- */
3885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003887 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 const char *errors)
3889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyUnicodeObject *v;
3892 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t startinpos;
3894 Py_ssize_t endinpos;
3895 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 const char *e;
3897 PyObject *errorHandler = NULL;
3898 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003899
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003901 if (size == 1 && *(unsigned char*)s < 128) {
3902 Py_UNICODE r = *(unsigned char*)s;
3903 return PyUnicode_FromUnicode(&r, 1);
3904 }
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 v = _PyUnicode_New(size);
3907 if (v == NULL)
3908 goto onError;
3909 if (size == 0)
3910 return (PyObject *)v;
3911 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 e = s + size;
3913 while (s < e) {
3914 register unsigned char c = (unsigned char)*s;
3915 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 ++s;
3918 }
3919 else {
3920 startinpos = s-starts;
3921 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003922 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 if (unicode_decode_call_errorhandler(
3924 errors, &errorHandler,
3925 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003926 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003931 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003932 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003933 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 Py_XDECREF(errorHandler);
3935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003937
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 onError:
3939 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 Py_XDECREF(errorHandler);
3941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 return NULL;
3943}
3944
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 const char *errors)
3948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950}
3951
3952PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3953{
3954 if (!PyUnicode_Check(unicode)) {
3955 PyErr_BadArgument();
3956 return NULL;
3957 }
3958 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3959 PyUnicode_GET_SIZE(unicode),
3960 NULL);
3961}
3962
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003963#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003964
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003965/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003966
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003967#if SIZEOF_INT < SIZEOF_SSIZE_T
3968#define NEED_RETRY
3969#endif
3970
3971/* XXX This code is limited to "true" double-byte encodings, as
3972 a) it assumes an incomplete character consists of a single byte, and
3973 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3974 encodings, see IsDBCSLeadByteEx documentation. */
3975
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.  IsDBCSLeadByte() alone only looks at the byte value, so
   the previous character (as located by CharPrev) is inspected too. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3986
3987/*
3988 * Decode MBCS string into unicode object. If 'final' is set, converts
3989 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3990 */
3991static int decode_mbcs(PyUnicodeObject **v,
3992 const char *s, /* MBCS string */
3993 int size, /* sizeof MBCS string */
3994 int final)
3995{
3996 Py_UNICODE *p;
3997 Py_ssize_t n = 0;
3998 int usize = 0;
3999
4000 assert(size >= 0);
4001
4002 /* Skip trailing lead-byte unless 'final' is set */
4003 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4004 --size;
4005
4006 /* First get the size of the result */
4007 if (size > 0) {
4008 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4009 if (usize == 0) {
4010 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4011 return -1;
4012 }
4013 }
4014
4015 if (*v == NULL) {
4016 /* Create unicode object */
4017 *v = _PyUnicode_New(usize);
4018 if (*v == NULL)
4019 return -1;
4020 }
4021 else {
4022 /* Extend unicode object */
4023 n = PyUnicode_GET_SIZE(*v);
4024 if (_PyUnicode_Resize(v, n + usize) < 0)
4025 return -1;
4026 }
4027
4028 /* Do the conversion */
4029 if (size > 0) {
4030 p = PyUnicode_AS_UNICODE(*v) + n;
4031 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033 return -1;
4034 }
4035 }
4036
4037 return size;
4038}
4039
4040PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4041 Py_ssize_t size,
4042 const char *errors,
4043 Py_ssize_t *consumed)
4044{
4045 PyUnicodeObject *v = NULL;
4046 int done;
4047
4048 if (consumed)
4049 *consumed = 0;
4050
4051#ifdef NEED_RETRY
4052 retry:
4053 if (size > INT_MAX)
4054 done = decode_mbcs(&v, s, INT_MAX, 0);
4055 else
4056#endif
4057 done = decode_mbcs(&v, s, (int)size, !consumed);
4058
4059 if (done < 0) {
4060 Py_XDECREF(v);
4061 return NULL;
4062 }
4063
4064 if (consumed)
4065 *consumed += done;
4066
4067#ifdef NEED_RETRY
4068 if (size > INT_MAX) {
4069 s += done;
4070 size -= done;
4071 goto retry;
4072 }
4073#endif
4074
4075 return (PyObject *)v;
4076}
4077
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004079 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080 const char *errors)
4081{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004082 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4083}
4084
4085/*
4086 * Convert unicode into string object (MBCS).
4087 * Returns 0 if succeed, -1 otherwise.
4088 */
4089static int encode_mbcs(PyObject **repr,
4090 const Py_UNICODE *p, /* unicode */
4091 int size) /* size of unicode */
4092{
4093 int mbcssize = 0;
4094 Py_ssize_t n = 0;
4095
4096 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004097
4098 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004099 if (size > 0) {
4100 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4101 if (mbcssize == 0) {
4102 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4103 return -1;
4104 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004105 }
4106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004107 if (*repr == NULL) {
4108 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004109 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004110 if (*repr == NULL)
4111 return -1;
4112 }
4113 else {
4114 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004115 n = PyBytes_Size(*repr);
4116 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004117 return -1;
4118 }
4119
4120 /* Do the conversion */
4121 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004122 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004123 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4124 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4125 return -1;
4126 }
4127 }
4128
4129 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004130}
4131
4132PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004134 const char *errors)
4135{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004136 PyObject *repr = NULL;
4137 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004138
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004139#ifdef NEED_RETRY
4140 retry:
4141 if (size > INT_MAX)
4142 ret = encode_mbcs(&repr, p, INT_MAX);
4143 else
4144#endif
4145 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004146
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004147 if (ret < 0) {
4148 Py_XDECREF(repr);
4149 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004150 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004151
4152#ifdef NEED_RETRY
4153 if (size > INT_MAX) {
4154 p += INT_MAX;
4155 size -= INT_MAX;
4156 goto retry;
4157 }
4158#endif
4159
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004160 return repr;
4161}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004162
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004163PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4164{
4165 if (!PyUnicode_Check(unicode)) {
4166 PyErr_BadArgument();
4167 return NULL;
4168 }
4169 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4170 PyUnicode_GET_SIZE(unicode),
4171 NULL);
4172}
4173
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004174#undef NEED_RETRY
4175
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004176#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178/* --- Character Mapping Codec -------------------------------------------- */
4179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004181 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 PyObject *mapping,
4183 const char *errors)
4184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t startinpos;
4187 Py_ssize_t endinpos;
4188 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 PyUnicodeObject *v;
4191 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004192 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 PyObject *errorHandler = NULL;
4194 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004195 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004196 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004197
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 /* Default to Latin-1 */
4199 if (mapping == NULL)
4200 return PyUnicode_DecodeLatin1(s, size, errors);
4201
4202 v = _PyUnicode_New(size);
4203 if (v == NULL)
4204 goto onError;
4205 if (size == 0)
4206 return (PyObject *)v;
4207 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004209 if (PyUnicode_CheckExact(mapping)) {
4210 mapstring = PyUnicode_AS_UNICODE(mapping);
4211 maplen = PyUnicode_GET_SIZE(mapping);
4212 while (s < e) {
4213 unsigned char ch = *s;
4214 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004216 if (ch < maplen)
4217 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004219 if (x == 0xfffe) {
4220 /* undefined mapping */
4221 outpos = p-PyUnicode_AS_UNICODE(v);
4222 startinpos = s-starts;
4223 endinpos = startinpos+1;
4224 if (unicode_decode_call_errorhandler(
4225 errors, &errorHandler,
4226 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004227 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004228 (PyObject **)&v, &outpos, &p)) {
4229 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004230 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004231 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004232 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004233 *p++ = x;
4234 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004236 }
4237 else {
4238 while (s < e) {
4239 unsigned char ch = *s;
4240 PyObject *w, *x;
4241
4242 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004243 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004244 if (w == NULL)
4245 goto onError;
4246 x = PyObject_GetItem(mapping, w);
4247 Py_DECREF(w);
4248 if (x == NULL) {
4249 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4250 /* No mapping found means: mapping is undefined. */
4251 PyErr_Clear();
4252 x = Py_None;
4253 Py_INCREF(x);
4254 } else
4255 goto onError;
4256 }
4257
4258 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004259 if (PyLong_Check(x)) {
4260 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004261 if (value < 0 || value > 65535) {
4262 PyErr_SetString(PyExc_TypeError,
4263 "character mapping must be in range(65536)");
4264 Py_DECREF(x);
4265 goto onError;
4266 }
4267 *p++ = (Py_UNICODE)value;
4268 }
4269 else if (x == Py_None) {
4270 /* undefined mapping */
4271 outpos = p-PyUnicode_AS_UNICODE(v);
4272 startinpos = s-starts;
4273 endinpos = startinpos+1;
4274 if (unicode_decode_call_errorhandler(
4275 errors, &errorHandler,
4276 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004277 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004278 (PyObject **)&v, &outpos, &p)) {
4279 Py_DECREF(x);
4280 goto onError;
4281 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004282 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004283 continue;
4284 }
4285 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004286 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004287
4288 if (targetsize == 1)
4289 /* 1-1 mapping */
4290 *p++ = *PyUnicode_AS_UNICODE(x);
4291
4292 else if (targetsize > 1) {
4293 /* 1-n mapping */
4294 if (targetsize > extrachars) {
4295 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004296 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4297 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004298 (targetsize << 2);
4299 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004300 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004301 if (_PyUnicode_Resize(&v,
4302 PyUnicode_GET_SIZE(v) + needed) < 0) {
4303 Py_DECREF(x);
4304 goto onError;
4305 }
4306 p = PyUnicode_AS_UNICODE(v) + oldpos;
4307 }
4308 Py_UNICODE_COPY(p,
4309 PyUnicode_AS_UNICODE(x),
4310 targetsize);
4311 p += targetsize;
4312 extrachars -= targetsize;
4313 }
4314 /* 1-0 mapping: skip the character */
4315 }
4316 else {
4317 /* wrong return value */
4318 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004319 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004320 Py_DECREF(x);
4321 goto onError;
4322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004324 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
4327 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004328 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 Py_XDECREF(errorHandler);
4331 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004333
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337 Py_XDECREF(v);
4338 return NULL;
4339}
4340
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341/* Charmap encoding: the lookup table */
4342
4343struct encoding_map{
4344 PyObject_HEAD
4345 unsigned char level1[32];
4346 int count2, count3;
4347 unsigned char level23[1];
4348};
4349
4350static PyObject*
4351encoding_map_size(PyObject *obj, PyObject* args)
4352{
4353 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004354 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 128*map->count3);
4356}
4357
4358static PyMethodDef encoding_map_methods[] = {
4359 {"size", encoding_map_size, METH_NOARGS,
4360 PyDoc_STR("Return the size (in bytes) of this object") },
4361 { 0 }
4362};
4363
4364static void
4365encoding_map_dealloc(PyObject* o)
4366{
4367 PyObject_FREE(o);
4368}
4369
4370static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004371 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004372 "EncodingMap", /*tp_name*/
4373 sizeof(struct encoding_map), /*tp_basicsize*/
4374 0, /*tp_itemsize*/
4375 /* methods */
4376 encoding_map_dealloc, /*tp_dealloc*/
4377 0, /*tp_print*/
4378 0, /*tp_getattr*/
4379 0, /*tp_setattr*/
4380 0, /*tp_compare*/
4381 0, /*tp_repr*/
4382 0, /*tp_as_number*/
4383 0, /*tp_as_sequence*/
4384 0, /*tp_as_mapping*/
4385 0, /*tp_hash*/
4386 0, /*tp_call*/
4387 0, /*tp_str*/
4388 0, /*tp_getattro*/
4389 0, /*tp_setattro*/
4390 0, /*tp_as_buffer*/
4391 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4392 0, /*tp_doc*/
4393 0, /*tp_traverse*/
4394 0, /*tp_clear*/
4395 0, /*tp_richcompare*/
4396 0, /*tp_weaklistoffset*/
4397 0, /*tp_iter*/
4398 0, /*tp_iternext*/
4399 encoding_map_methods, /*tp_methods*/
4400 0, /*tp_members*/
4401 0, /*tp_getset*/
4402 0, /*tp_base*/
4403 0, /*tp_dict*/
4404 0, /*tp_descr_get*/
4405 0, /*tp_descr_set*/
4406 0, /*tp_dictoffset*/
4407 0, /*tp_init*/
4408 0, /*tp_alloc*/
4409 0, /*tp_new*/
4410 0, /*tp_free*/
4411 0, /*tp_is_gc*/
4412};
4413
4414PyObject*
4415PyUnicode_BuildEncodingMap(PyObject* string)
4416{
4417 Py_UNICODE *decode;
4418 PyObject *result;
4419 struct encoding_map *mresult;
4420 int i;
4421 int need_dict = 0;
4422 unsigned char level1[32];
4423 unsigned char level2[512];
4424 unsigned char *mlevel1, *mlevel2, *mlevel3;
4425 int count2 = 0, count3 = 0;
4426
4427 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4428 PyErr_BadArgument();
4429 return NULL;
4430 }
4431 decode = PyUnicode_AS_UNICODE(string);
4432 memset(level1, 0xFF, sizeof level1);
4433 memset(level2, 0xFF, sizeof level2);
4434
4435 /* If there isn't a one-to-one mapping of NULL to \0,
4436 or if there are non-BMP characters, we need to use
4437 a mapping dictionary. */
4438 if (decode[0] != 0)
4439 need_dict = 1;
4440 for (i = 1; i < 256; i++) {
4441 int l1, l2;
4442 if (decode[i] == 0
4443 #ifdef Py_UNICODE_WIDE
4444 || decode[i] > 0xFFFF
4445 #endif
4446 ) {
4447 need_dict = 1;
4448 break;
4449 }
4450 if (decode[i] == 0xFFFE)
4451 /* unmapped character */
4452 continue;
4453 l1 = decode[i] >> 11;
4454 l2 = decode[i] >> 7;
4455 if (level1[l1] == 0xFF)
4456 level1[l1] = count2++;
4457 if (level2[l2] == 0xFF)
4458 level2[l2] = count3++;
4459 }
4460
4461 if (count2 >= 0xFF || count3 >= 0xFF)
4462 need_dict = 1;
4463
4464 if (need_dict) {
4465 PyObject *result = PyDict_New();
4466 PyObject *key, *value;
4467 if (!result)
4468 return NULL;
4469 for (i = 0; i < 256; i++) {
4470 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004471 key = PyLong_FromLong(decode[i]);
4472 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 if (!key || !value)
4474 goto failed1;
4475 if (PyDict_SetItem(result, key, value) == -1)
4476 goto failed1;
4477 Py_DECREF(key);
4478 Py_DECREF(value);
4479 }
4480 return result;
4481 failed1:
4482 Py_XDECREF(key);
4483 Py_XDECREF(value);
4484 Py_DECREF(result);
4485 return NULL;
4486 }
4487
4488 /* Create a three-level trie */
4489 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4490 16*count2 + 128*count3 - 1);
4491 if (!result)
4492 return PyErr_NoMemory();
4493 PyObject_Init(result, &EncodingMapType);
4494 mresult = (struct encoding_map*)result;
4495 mresult->count2 = count2;
4496 mresult->count3 = count3;
4497 mlevel1 = mresult->level1;
4498 mlevel2 = mresult->level23;
4499 mlevel3 = mresult->level23 + 16*count2;
4500 memcpy(mlevel1, level1, 32);
4501 memset(mlevel2, 0xFF, 16*count2);
4502 memset(mlevel3, 0, 128*count3);
4503 count3 = 0;
4504 for (i = 1; i < 256; i++) {
4505 int o1, o2, o3, i2, i3;
4506 if (decode[i] == 0xFFFE)
4507 /* unmapped character */
4508 continue;
4509 o1 = decode[i]>>11;
4510 o2 = (decode[i]>>7) & 0xF;
4511 i2 = 16*mlevel1[o1] + o2;
4512 if (mlevel2[i2] == 0xFF)
4513 mlevel2[i2] = count3++;
4514 o3 = decode[i] & 0x7F;
4515 i3 = 128*mlevel2[i2] + o3;
4516 mlevel3[i3] = i;
4517 }
4518 return result;
4519}
4520
4521static int
4522encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4523{
4524 struct encoding_map *map = (struct encoding_map*)mapping;
4525 int l1 = c>>11;
4526 int l2 = (c>>7) & 0xF;
4527 int l3 = c & 0x7F;
4528 int i;
4529
4530#ifdef Py_UNICODE_WIDE
4531 if (c > 0xFFFF) {
4532 return -1;
4533 }
4534#endif
4535 if (c == 0)
4536 return 0;
4537 /* level 1*/
4538 i = map->level1[l1];
4539 if (i == 0xFF) {
4540 return -1;
4541 }
4542 /* level 2*/
4543 i = map->level23[16*i+l2];
4544 if (i == 0xFF) {
4545 return -1;
4546 }
4547 /* level 3 */
4548 i = map->level23[16*map->count2 + 128*i + l3];
4549 if (i == 0) {
4550 return -1;
4551 }
4552 return i;
4553}
4554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555/* Lookup the character ch in the mapping. If the character
4556 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004557 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559{
Christian Heimes217cfd12007-12-02 14:31:20 +00004560 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *x;
4562
4563 if (w == NULL)
4564 return NULL;
4565 x = PyObject_GetItem(mapping, w);
4566 Py_DECREF(w);
4567 if (x == NULL) {
4568 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4569 /* No mapping found means: mapping is undefined. */
4570 PyErr_Clear();
4571 x = Py_None;
4572 Py_INCREF(x);
4573 return x;
4574 } else
4575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004577 else if (x == Py_None)
4578 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004579 else if (PyLong_Check(x)) {
4580 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 if (value < 0 || value > 255) {
4582 PyErr_SetString(PyExc_TypeError,
4583 "character mapping must be in range(256)");
4584 Py_DECREF(x);
4585 return NULL;
4586 }
4587 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004589 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004593 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004594 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004595 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 Py_DECREF(x);
4597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 }
4599}
4600
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004602charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004603{
Christian Heimes72b710a2008-05-26 13:28:38 +00004604 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004605 /* exponentially overallocate to minimize reallocations */
4606 if (requiredsize < 2*outsize)
4607 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004608 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004609 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004610 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004611}
4612
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004617 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 space is available. Return a new reference to the object that
4619 was put in the output buffer, or Py_None, if the mapping was undefined
4620 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004621 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004623charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004624 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004626 PyObject *rep;
4627 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629
Christian Heimes90aa7642007-12-19 02:45:37 +00004630 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004631 int res = encoding_map_lookup(c, mapping);
4632 Py_ssize_t requiredsize = *outpos+1;
4633 if (res == -1)
4634 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004635 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004636 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004637 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004638 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004639 outstart[(*outpos)++] = (char)res;
4640 return enc_SUCCESS;
4641 }
4642
4643 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004645 return enc_EXCEPTION;
4646 else if (rep==Py_None) {
4647 Py_DECREF(rep);
4648 return enc_FAILED;
4649 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004650 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004652 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004653 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004655 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004657 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004658 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
4660 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004661 const char *repchars = PyBytes_AS_STRING(rep);
4662 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004663 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004664 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004665 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004667 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004669 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 memcpy(outstart + *outpos, repchars, repsize);
4671 *outpos += repsize;
4672 }
4673 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004674 Py_DECREF(rep);
4675 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676}
4677
4678/* handle an error in PyUnicode_EncodeCharmap
4679 Return 0 on success, -1 on error */
4680static
4681int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004682 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004684 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004685 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686{
4687 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t repsize;
4689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_UNICODE *uni2;
4691 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 Py_ssize_t collstartpos = *inpos;
4693 Py_ssize_t collendpos = *inpos+1;
4694 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 char *encoding = "charmap";
4696 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 /* find all unencodable characters */
4700 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004701 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004702 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004703 int res = encoding_map_lookup(p[collendpos], mapping);
4704 if (res != -1)
4705 break;
4706 ++collendpos;
4707 continue;
4708 }
4709
4710 rep = charmapencode_lookup(p[collendpos], mapping);
4711 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004713 else if (rep!=Py_None) {
4714 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 break;
4716 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004717 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 ++collendpos;
4719 }
4720 /* cache callback name lookup
4721 * (if not done yet, i.e. it's the first error) */
4722 if (*known_errorHandler==-1) {
4723 if ((errors==NULL) || (!strcmp(errors, "strict")))
4724 *known_errorHandler = 1;
4725 else if (!strcmp(errors, "replace"))
4726 *known_errorHandler = 2;
4727 else if (!strcmp(errors, "ignore"))
4728 *known_errorHandler = 3;
4729 else if (!strcmp(errors, "xmlcharrefreplace"))
4730 *known_errorHandler = 4;
4731 else
4732 *known_errorHandler = 0;
4733 }
4734 switch (*known_errorHandler) {
4735 case 1: /* strict */
4736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 case 2: /* replace */
4739 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4740 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004741 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 return -1;
4743 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004744 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4746 return -1;
4747 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 }
4749 /* fall through */
4750 case 3: /* ignore */
4751 *inpos = collendpos;
4752 break;
4753 case 4: /* xmlcharrefreplace */
4754 /* generate replacement (temporarily (mis)uses p) */
4755 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4756 char buffer[2+29+1+1];
4757 char *cp;
4758 sprintf(buffer, "&#%d;", (int)p[collpos]);
4759 for (cp = buffer; *cp; ++cp) {
4760 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004761 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004763 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4765 return -1;
4766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 }
4768 }
4769 *inpos = collendpos;
4770 break;
4771 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004772 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 encoding, reason, p, size, exceptionObject,
4774 collstartpos, collendpos, &newpos);
4775 if (repunicode == NULL)
4776 return -1;
4777 /* generate replacement */
4778 repsize = PyUnicode_GET_SIZE(repunicode);
4779 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4780 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004781 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 return -1;
4783 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004784 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4787 return -1;
4788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 }
4790 *inpos = newpos;
4791 Py_DECREF(repunicode);
4792 }
4793 return 0;
4794}
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 PyObject *mapping,
4799 const char *errors)
4800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 /* output object */
4802 PyObject *res = NULL;
4803 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 PyObject *errorHandler = NULL;
4808 PyObject *exc = NULL;
4809 /* the following variable is used for caching string comparisons
4810 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4811 * 3=ignore, 4=xmlcharrefreplace */
4812 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
4814 /* Default to Latin-1 */
4815 if (mapping == NULL)
4816 return PyUnicode_EncodeLatin1(p, size, errors);
4817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 /* allocate enough for a simple encoding without
4819 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004820 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 if (res == NULL)
4822 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004823 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 while (inpos<size) {
4827 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004828 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004829 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004831 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 if (charmap_encoding_error(p, size, &inpos, mapping,
4833 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004834 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004835 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004836 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 else
4840 /* done with this character => adjust input position */
4841 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004845 if (respos<PyBytes_GET_SIZE(res))
4846 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 Py_XDECREF(exc);
4849 Py_XDECREF(errorHandler);
4850 return res;
4851
4852 onError:
4853 Py_XDECREF(res);
4854 Py_XDECREF(exc);
4855 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 return NULL;
4857}
4858
4859PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4860 PyObject *mapping)
4861{
4862 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4863 PyErr_BadArgument();
4864 return NULL;
4865 }
4866 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4867 PyUnicode_GET_SIZE(unicode),
4868 mapping,
4869 NULL);
4870}
4871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872/* create or adjust a UnicodeTranslateError */
4873static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 const Py_UNICODE *unicode, Py_ssize_t size,
4875 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 if (*exceptionObject == NULL) {
4879 *exceptionObject = PyUnicodeTranslateError_Create(
4880 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 }
4882 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4884 goto onError;
4885 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4886 goto onError;
4887 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4888 goto onError;
4889 return;
4890 onError:
4891 Py_DECREF(*exceptionObject);
4892 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 }
4894}
4895
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896/* raises a UnicodeTranslateError */
4897static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 const Py_UNICODE *unicode, Py_ssize_t size,
4899 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 const char *reason)
4901{
4902 make_translate_exception(exceptionObject,
4903 unicode, size, startpos, endpos, reason);
4904 if (*exceptionObject != NULL)
4905 PyCodec_StrictErrors(*exceptionObject);
4906}
4907
4908/* error handling callback helper:
4909 build arguments, call the callback and check the arguments,
4910 put the result into newpos and return the replacement string, which
4911 has to be freed by the caller */
4912static PyObject *unicode_translate_call_errorhandler(const char *errors,
4913 PyObject **errorHandler,
4914 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4916 Py_ssize_t startpos, Py_ssize_t endpos,
4917 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004919 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004921 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 PyObject *restuple;
4923 PyObject *resunicode;
4924
4925 if (*errorHandler == NULL) {
4926 *errorHandler = PyCodec_LookupError(errors);
4927 if (*errorHandler == NULL)
4928 return NULL;
4929 }
4930
4931 make_translate_exception(exceptionObject,
4932 unicode, size, startpos, endpos, reason);
4933 if (*exceptionObject == NULL)
4934 return NULL;
4935
4936 restuple = PyObject_CallFunctionObjArgs(
4937 *errorHandler, *exceptionObject, NULL);
4938 if (restuple == NULL)
4939 return NULL;
4940 if (!PyTuple_Check(restuple)) {
4941 PyErr_Format(PyExc_TypeError, &argparse[4]);
4942 Py_DECREF(restuple);
4943 return NULL;
4944 }
4945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004946 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 Py_DECREF(restuple);
4948 return NULL;
4949 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004950 if (i_newpos<0)
4951 *newpos = size+i_newpos;
4952 else
4953 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004954 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004956 Py_DECREF(restuple);
4957 return NULL;
4958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 Py_INCREF(resunicode);
4960 Py_DECREF(restuple);
4961 return resunicode;
4962}
4963
4964/* Lookup the character ch in the mapping and put the result in result,
4965 which must be decrefed by the caller.
4966 Return 0 on success, -1 on error */
4967static
4968int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4969{
Christian Heimes217cfd12007-12-02 14:31:20 +00004970 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 PyObject *x;
4972
4973 if (w == NULL)
4974 return -1;
4975 x = PyObject_GetItem(mapping, w);
4976 Py_DECREF(w);
4977 if (x == NULL) {
4978 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4979 /* No mapping found means: use 1:1 mapping. */
4980 PyErr_Clear();
4981 *result = NULL;
4982 return 0;
4983 } else
4984 return -1;
4985 }
4986 else if (x == Py_None) {
4987 *result = x;
4988 return 0;
4989 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004990 else if (PyLong_Check(x)) {
4991 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 long max = PyUnicode_GetMax();
4993 if (value < 0 || value > max) {
4994 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004995 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 Py_DECREF(x);
4997 return -1;
4998 }
4999 *result = x;
5000 return 0;
5001 }
5002 else if (PyUnicode_Check(x)) {
5003 *result = x;
5004 return 0;
5005 }
5006 else {
5007 /* wrong return value */
5008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005009 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005010 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 return -1;
5012 }
5013}
5014/* ensure that *outobj is at least requiredsize characters long,
5015if not reallocate and adjust various state variables.
5016Return 0 on success, -1 on error */
5017static
Walter Dörwald4894c302003-10-24 14:25:28 +00005018int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005019 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005021 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005022 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005024 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005026 if (requiredsize < 2 * oldsize)
5027 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005028 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 return -1;
5030 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 }
5032 return 0;
5033}
5034/* lookup the character, put the result in the output string and adjust
5035 various state variables. Return a new reference to the object that
5036 was put in the output buffer in *result, or Py_None, if the mapping was
5037 undefined (in which case no character was written).
5038 The called must decref result.
5039 Return 0 on success, -1 on error. */
5040static
Walter Dörwald4894c302003-10-24 14:25:28 +00005041int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005043 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044{
Walter Dörwald4894c302003-10-24 14:25:28 +00005045 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 return -1;
5047 if (*res==NULL) {
5048 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005049 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 }
5051 else if (*res==Py_None)
5052 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005053 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005055 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 }
5057 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005058 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 if (repsize==1) {
5060 /* no overflow check, because we know that the space is enough */
5061 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5062 }
5063 else if (repsize!=0) {
5064 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005066 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005067 repsize - 1;
5068 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 return -1;
5070 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5071 *outp += repsize;
5072 }
5073 }
5074 else
5075 return -1;
5076 return 0;
5077}
5078
5079PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005080 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 PyObject *mapping,
5082 const char *errors)
5083{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 /* output object */
5085 PyObject *res = NULL;
5086 /* pointers to the beginning and end+1 of input */
5087 const Py_UNICODE *startp = p;
5088 const Py_UNICODE *endp = p + size;
5089 /* pointer into the output */
5090 Py_UNICODE *str;
5091 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 char *reason = "character maps to <undefined>";
5094 PyObject *errorHandler = NULL;
5095 PyObject *exc = NULL;
5096 /* the following variable is used for caching string comparisons
5097 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5098 * 3=ignore, 4=xmlcharrefreplace */
5099 int known_errorHandler = -1;
5100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (mapping == NULL) {
5102 PyErr_BadArgument();
5103 return NULL;
5104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105
5106 /* allocate enough for a simple 1:1 translation without
5107 replacements, if we need more, we'll resize */
5108 res = PyUnicode_FromUnicode(NULL, size);
5109 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005110 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 return res;
5113 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 while (p<endp) {
5116 /* try to encode it */
5117 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005118 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 goto onError;
5121 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005122 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 if (x!=Py_None) /* it worked => adjust input pointer */
5124 ++p;
5125 else { /* untranslatable character */
5126 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005127 Py_ssize_t repsize;
5128 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 Py_UNICODE *uni2;
5130 /* startpos for collecting untranslatable chars */
5131 const Py_UNICODE *collstart = p;
5132 const Py_UNICODE *collend = p+1;
5133 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 /* find all untranslatable characters */
5136 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005137 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 goto onError;
5139 Py_XDECREF(x);
5140 if (x!=Py_None)
5141 break;
5142 ++collend;
5143 }
5144 /* cache callback name lookup
5145 * (if not done yet, i.e. it's the first error) */
5146 if (known_errorHandler==-1) {
5147 if ((errors==NULL) || (!strcmp(errors, "strict")))
5148 known_errorHandler = 1;
5149 else if (!strcmp(errors, "replace"))
5150 known_errorHandler = 2;
5151 else if (!strcmp(errors, "ignore"))
5152 known_errorHandler = 3;
5153 else if (!strcmp(errors, "xmlcharrefreplace"))
5154 known_errorHandler = 4;
5155 else
5156 known_errorHandler = 0;
5157 }
5158 switch (known_errorHandler) {
5159 case 1: /* strict */
5160 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5161 goto onError;
5162 case 2: /* replace */
5163 /* No need to check for space, this is a 1:1 replacement */
5164 for (coll = collstart; coll<collend; ++coll)
5165 *str++ = '?';
5166 /* fall through */
5167 case 3: /* ignore */
5168 p = collend;
5169 break;
5170 case 4: /* xmlcharrefreplace */
5171 /* generate replacement (temporarily (mis)uses p) */
5172 for (p = collstart; p < collend; ++p) {
5173 char buffer[2+29+1+1];
5174 char *cp;
5175 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005176 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5178 goto onError;
5179 for (cp = buffer; *cp; ++cp)
5180 *str++ = *cp;
5181 }
5182 p = collend;
5183 break;
5184 default:
5185 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5186 reason, startp, size, &exc,
5187 collstart-startp, collend-startp, &newpos);
5188 if (repunicode == NULL)
5189 goto onError;
5190 /* generate replacement */
5191 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005192 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5194 Py_DECREF(repunicode);
5195 goto onError;
5196 }
5197 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5198 *str++ = *uni2;
5199 p = startp + newpos;
5200 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 }
5202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 /* Resize if we allocated to much */
5205 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005206 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005207 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 }
5210 Py_XDECREF(exc);
5211 Py_XDECREF(errorHandler);
5212 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 onError:
5215 Py_XDECREF(res);
5216 Py_XDECREF(exc);
5217 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 return NULL;
5219}
5220
5221PyObject *PyUnicode_Translate(PyObject *str,
5222 PyObject *mapping,
5223 const char *errors)
5224{
5225 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005226
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 str = PyUnicode_FromObject(str);
5228 if (str == NULL)
5229 goto onError;
5230 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5231 PyUnicode_GET_SIZE(str),
5232 mapping,
5233 errors);
5234 Py_DECREF(str);
5235 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 onError:
5238 Py_XDECREF(str);
5239 return NULL;
5240}
Tim Petersced69f82003-09-16 20:30:58 +00005241
Guido van Rossum9e896b32000-04-05 20:11:21 +00005242/* --- Decimal Encoder ---------------------------------------------------- */
5243
5244int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005246 char *output,
5247 const char *errors)
5248{
5249 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
5252 const char *encoding = "decimal";
5253 const char *reason = "invalid decimal Unicode string";
5254 /* the following variable is used for caching string comparisons
5255 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5256 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005257
5258 if (output == NULL) {
5259 PyErr_BadArgument();
5260 return -1;
5261 }
5262
5263 p = s;
5264 end = s + length;
5265 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005267 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 Py_ssize_t repsize;
5270 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 Py_UNICODE *uni2;
5272 Py_UNICODE *collstart;
5273 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005274
Guido van Rossum9e896b32000-04-05 20:11:21 +00005275 if (Py_UNICODE_ISSPACE(ch)) {
5276 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005278 continue;
5279 }
5280 decimal = Py_UNICODE_TODECIMAL(ch);
5281 if (decimal >= 0) {
5282 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005284 continue;
5285 }
Guido van Rossumba477042000-04-06 18:18:10 +00005286 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005287 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005288 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005289 continue;
5290 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005291 /* All other characters are considered unencodable */
5292 collstart = p;
5293 collend = p+1;
5294 while (collend < end) {
5295 if ((0 < *collend && *collend < 256) ||
5296 !Py_UNICODE_ISSPACE(*collend) ||
5297 Py_UNICODE_TODECIMAL(*collend))
5298 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300 /* cache callback name lookup
5301 * (if not done yet, i.e. it's the first error) */
5302 if (known_errorHandler==-1) {
5303 if ((errors==NULL) || (!strcmp(errors, "strict")))
5304 known_errorHandler = 1;
5305 else if (!strcmp(errors, "replace"))
5306 known_errorHandler = 2;
5307 else if (!strcmp(errors, "ignore"))
5308 known_errorHandler = 3;
5309 else if (!strcmp(errors, "xmlcharrefreplace"))
5310 known_errorHandler = 4;
5311 else
5312 known_errorHandler = 0;
5313 }
5314 switch (known_errorHandler) {
5315 case 1: /* strict */
5316 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5317 goto onError;
5318 case 2: /* replace */
5319 for (p = collstart; p < collend; ++p)
5320 *output++ = '?';
5321 /* fall through */
5322 case 3: /* ignore */
5323 p = collend;
5324 break;
5325 case 4: /* xmlcharrefreplace */
5326 /* generate replacement (temporarily (mis)uses p) */
5327 for (p = collstart; p < collend; ++p)
5328 output += sprintf(output, "&#%d;", (int)*p);
5329 p = collend;
5330 break;
5331 default:
5332 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5333 encoding, reason, s, length, &exc,
5334 collstart-s, collend-s, &newpos);
5335 if (repunicode == NULL)
5336 goto onError;
5337 /* generate replacement */
5338 repsize = PyUnicode_GET_SIZE(repunicode);
5339 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5340 Py_UNICODE ch = *uni2;
5341 if (Py_UNICODE_ISSPACE(ch))
5342 *output++ = ' ';
5343 else {
5344 decimal = Py_UNICODE_TODECIMAL(ch);
5345 if (decimal >= 0)
5346 *output++ = '0' + decimal;
5347 else if (0 < ch && ch < 256)
5348 *output++ = (char)ch;
5349 else {
5350 Py_DECREF(repunicode);
5351 raise_encode_exception(&exc, encoding,
5352 s, length, collstart-s, collend-s, reason);
5353 goto onError;
5354 }
5355 }
5356 }
5357 p = s + newpos;
5358 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005359 }
5360 }
5361 /* 0-terminate the output string */
5362 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 Py_XDECREF(exc);
5364 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005365 return 0;
5366
5367 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 Py_XDECREF(exc);
5369 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005370 return -1;
5371}
5372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373/* --- Helpers ------------------------------------------------------------ */
5374
Eric Smith8c663262007-08-25 02:26:07 +00005375#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005376#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005377#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005378/* Include _ParseTupleFinds from find.h */
5379#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005380#include "stringlib/find.h"
5381#include "stringlib/partition.h"
5382
Eric Smith5807c412008-05-11 21:00:57 +00005383#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5384#include "stringlib/localeutil.h"
5385
/* Helper macro: normalize the slice bounds `start` and `end` (variables
   in the expanding scope) against (obj)->length.  A negative bound is
   taken relative to the length and then clamped at 0; `end` is also
   capped at the length.  `start` is not capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5398
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005400 PyObject *substr,
5401 Py_ssize_t start,
5402 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005405 PyUnicodeObject* str_obj;
5406 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005407
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5409 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005411 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5412 if (!sub_obj) {
5413 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 return -1;
5415 }
Tim Petersced69f82003-09-16 20:30:58 +00005416
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005418
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 result = stringlib_count(
5420 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5421 );
5422
5423 Py_DECREF(sub_obj);
5424 Py_DECREF(str_obj);
5425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 return result;
5427}
5428
Martin v. Löwis18e16552006-02-15 17:27:45 +00005429Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005430 PyObject *sub,
5431 Py_ssize_t start,
5432 Py_ssize_t end,
5433 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005438 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005439 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 sub = PyUnicode_FromObject(sub);
5441 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005442 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005443 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 }
Tim Petersced69f82003-09-16 20:30:58 +00005445
Thomas Wouters477c8d52006-05-27 19:21:47 +00005446 if (direction > 0)
5447 result = stringlib_find_slice(
5448 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5449 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5450 start, end
5451 );
5452 else
5453 result = stringlib_rfind_slice(
5454 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5455 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5456 start, end
5457 );
5458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005460 Py_DECREF(sub);
5461
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return result;
5463}
5464
Tim Petersced69f82003-09-16 20:30:58 +00005465static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466int tailmatch(PyUnicodeObject *self,
5467 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t start,
5469 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 int direction)
5471{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (substring->length == 0)
5473 return 1;
5474
Thomas Wouters477c8d52006-05-27 19:21:47 +00005475 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477 end -= substring->length;
5478 if (end < start)
5479 return 0;
5480
5481 if (direction > 0) {
5482 if (Py_UNICODE_MATCH(self, end, substring))
5483 return 1;
5484 } else {
5485 if (Py_UNICODE_MATCH(self, start, substring))
5486 return 1;
5487 }
5488
5489 return 0;
5490}
5491
Martin v. Löwis18e16552006-02-15 17:27:45 +00005492Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005494 Py_ssize_t start,
5495 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 int direction)
5497{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005499
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 str = PyUnicode_FromObject(str);
5501 if (str == NULL)
5502 return -1;
5503 substr = PyUnicode_FromObject(substr);
5504 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005505 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 return -1;
5507 }
Tim Petersced69f82003-09-16 20:30:58 +00005508
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 result = tailmatch((PyUnicodeObject *)str,
5510 (PyUnicodeObject *)substr,
5511 start, end, direction);
5512 Py_DECREF(str);
5513 Py_DECREF(substr);
5514 return result;
5515}
5516
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517/* Apply fixfct filter to the Unicode object self and return a
5518 reference to the modified object */
5519
Tim Petersced69f82003-09-16 20:30:58 +00005520static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521PyObject *fixup(PyUnicodeObject *self,
5522 int (*fixfct)(PyUnicodeObject *s))
5523{
5524
5525 PyUnicodeObject *u;
5526
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005527 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 if (u == NULL)
5529 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005530
5531 Py_UNICODE_COPY(u->str, self->str, self->length);
5532
Tim Peters7a29bd52001-09-12 03:03:31 +00005533 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 /* fixfct should return TRUE if it modified the buffer. If
5535 FALSE, return a reference to the original buffer instead
5536 (to save space, not time) */
5537 Py_INCREF(self);
5538 Py_DECREF(u);
5539 return (PyObject*) self;
5540 }
5541 return (PyObject*) u;
5542}
5543
Tim Petersced69f82003-09-16 20:30:58 +00005544static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545int fixupper(PyUnicodeObject *self)
5546{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 Py_UNICODE *s = self->str;
5549 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005550
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 while (len-- > 0) {
5552 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 ch = Py_UNICODE_TOUPPER(*s);
5555 if (ch != *s) {
5556 status = 1;
5557 *s = ch;
5558 }
5559 s++;
5560 }
5561
5562 return status;
5563}
5564
Tim Petersced69f82003-09-16 20:30:58 +00005565static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566int fixlower(PyUnicodeObject *self)
5567{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005568 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 Py_UNICODE *s = self->str;
5570 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 while (len-- > 0) {
5573 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 ch = Py_UNICODE_TOLOWER(*s);
5576 if (ch != *s) {
5577 status = 1;
5578 *s = ch;
5579 }
5580 s++;
5581 }
5582
5583 return status;
5584}
5585
Tim Petersced69f82003-09-16 20:30:58 +00005586static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587int fixswapcase(PyUnicodeObject *self)
5588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 Py_UNICODE *s = self->str;
5591 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005592
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 while (len-- > 0) {
5594 if (Py_UNICODE_ISUPPER(*s)) {
5595 *s = Py_UNICODE_TOLOWER(*s);
5596 status = 1;
5597 } else if (Py_UNICODE_ISLOWER(*s)) {
5598 *s = Py_UNICODE_TOUPPER(*s);
5599 status = 1;
5600 }
5601 s++;
5602 }
5603
5604 return status;
5605}
5606
Tim Petersced69f82003-09-16 20:30:58 +00005607static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608int fixcapitalize(PyUnicodeObject *self)
5609{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005611 Py_UNICODE *s = self->str;
5612 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005613
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005614 if (len == 0)
5615 return 0;
5616 if (Py_UNICODE_ISLOWER(*s)) {
5617 *s = Py_UNICODE_TOUPPER(*s);
5618 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005620 s++;
5621 while (--len > 0) {
5622 if (Py_UNICODE_ISUPPER(*s)) {
5623 *s = Py_UNICODE_TOLOWER(*s);
5624 status = 1;
5625 }
5626 s++;
5627 }
5628 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629}
5630
5631static
5632int fixtitle(PyUnicodeObject *self)
5633{
5634 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5635 register Py_UNICODE *e;
5636 int previous_is_cased;
5637
5638 /* Shortcut for single character strings */
5639 if (PyUnicode_GET_SIZE(self) == 1) {
5640 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5641 if (*p != ch) {
5642 *p = ch;
5643 return 1;
5644 }
5645 else
5646 return 0;
5647 }
Tim Petersced69f82003-09-16 20:30:58 +00005648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 e = p + PyUnicode_GET_SIZE(self);
5650 previous_is_cased = 0;
5651 for (; p < e; p++) {
5652 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (previous_is_cased)
5655 *p = Py_UNICODE_TOLOWER(ch);
5656 else
5657 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005658
5659 if (Py_UNICODE_ISLOWER(ch) ||
5660 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 Py_UNICODE_ISTITLE(ch))
5662 previous_is_cased = 1;
5663 else
5664 previous_is_cased = 0;
5665 }
5666 return 1;
5667}
5668
Tim Peters8ce9f162004-08-27 01:49:32 +00005669PyObject *
5670PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671{
Skip Montanaro6543b452004-09-16 03:28:13 +00005672 const Py_UNICODE blank = ' ';
5673 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005674 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005675 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5677 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005678 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5679 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005681 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 fseq = PySequence_Fast(seq, "");
5684 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 }
5687
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005688 /* NOTE: the following code can't call back into Python code,
5689 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005690 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005691
Tim Peters05eba1f2004-08-27 21:32:02 +00005692 seqlen = PySequence_Fast_GET_SIZE(fseq);
5693 /* If empty sequence, return u"". */
5694 if (seqlen == 0) {
5695 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5696 goto Done;
5697 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005698 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005699 /* If singleton sequence with an exact Unicode, return that. */
5700 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005701 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 if (PyUnicode_CheckExact(item)) {
5703 Py_INCREF(item);
5704 res = (PyUnicodeObject *)item;
5705 goto Done;
5706 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005707 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005708 else {
5709 /* Set up sep and seplen */
5710 if (separator == NULL) {
5711 sep = &blank;
5712 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005713 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005714 else {
5715 if (!PyUnicode_Check(separator)) {
5716 PyErr_Format(PyExc_TypeError,
5717 "separator: expected str instance,"
5718 " %.80s found",
5719 Py_TYPE(separator)->tp_name);
5720 goto onError;
5721 }
5722 sep = PyUnicode_AS_UNICODE(separator);
5723 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005724 }
5725 }
5726
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005727 /* There are at least two things to join, or else we have a subclass
5728 * of str in the sequence.
5729 * Do a pre-pass to figure out the total amount of space we'll
5730 * need (sz), and see whether all argument are strings.
5731 */
5732 sz = 0;
5733 for (i = 0; i < seqlen; i++) {
5734 const Py_ssize_t old_sz = sz;
5735 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005736 if (!PyUnicode_Check(item)) {
5737 PyErr_Format(PyExc_TypeError,
5738 "sequence item %zd: expected str instance,"
5739 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005740 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005741 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005742 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005743 sz += PyUnicode_GET_SIZE(item);
5744 if (i != 0)
5745 sz += seplen;
5746 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5747 PyErr_SetString(PyExc_OverflowError,
5748 "join() result is too long for a Python string");
5749 goto onError;
5750 }
5751 }
Tim Petersced69f82003-09-16 20:30:58 +00005752
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005753 res = _PyUnicode_New(sz);
5754 if (res == NULL)
5755 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005757 /* Catenate everything. */
5758 res_p = PyUnicode_AS_UNICODE(res);
5759 for (i = 0; i < seqlen; ++i) {
5760 Py_ssize_t itemlen;
5761 item = items[i];
5762 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005763 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005764 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005765 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005766 res_p += seplen;
5767 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005768 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5769 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005770 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005771
Tim Peters8ce9f162004-08-27 01:49:32 +00005772 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005773 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 return (PyObject *)res;
5775
5776 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005777 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005778 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780}
5781
Tim Petersced69f82003-09-16 20:30:58 +00005782static
5783PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 Py_ssize_t left,
5785 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 Py_UNICODE fill)
5787{
5788 PyUnicodeObject *u;
5789
5790 if (left < 0)
5791 left = 0;
5792 if (right < 0)
5793 right = 0;
5794
Tim Peters7a29bd52001-09-12 03:03:31 +00005795 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 Py_INCREF(self);
5797 return self;
5798 }
5799
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005800 if (left > PY_SSIZE_T_MAX - self->length ||
5801 right > PY_SSIZE_T_MAX - (left + self->length)) {
5802 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5803 return NULL;
5804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 u = _PyUnicode_New(left + self->length + right);
5806 if (u) {
5807 if (left)
5808 Py_UNICODE_FILL(u->str, fill, left);
5809 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5810 if (right)
5811 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5812 }
5813
5814 return u;
5815}
5816
/* Append data[left:right] as a new unicode object to `list`.  Expects
   the expanding function to provide the variables `str` and `list` and
   an `onError` label, which is jumped to if creating or appending the
   object fails.  Expands to several statements, so it must be wrapped in
   braces when used as the body of an if/loop. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5827
5828static
5829PyObject *split_whitespace(PyUnicodeObject *self,
5830 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005837 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 for (i = j = 0; i < len; ) {
5840 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005841 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 i++;
5843 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005844 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 i++;
5846 if (j < i) {
5847 if (maxcount-- <= 0)
5848 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005849 SPLIT_APPEND(buf, j, i);
5850 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 i++;
5852 j = i;
5853 }
5854 }
5855 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005856 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 }
5858 return list;
5859
5860 onError:
5861 Py_DECREF(list);
5862 return NULL;
5863}
5864
5865PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005866 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 PyObject *list;
5872 PyObject *str;
5873 Py_UNICODE *data;
5874
5875 string = PyUnicode_FromObject(string);
5876 if (string == NULL)
5877 return NULL;
5878 data = PyUnicode_AS_UNICODE(string);
5879 len = PyUnicode_GET_SIZE(string);
5880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 list = PyList_New(0);
5882 if (!list)
5883 goto onError;
5884
5885 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005886 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005893 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 if (i < len) {
5895 if (data[i] == '\r' && i + 1 < len &&
5896 data[i+1] == '\n')
5897 i += 2;
5898 else
5899 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005900 if (keepends)
5901 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
Guido van Rossum86662912000-04-11 15:38:46 +00005903 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 j = i;
5905 }
5906 if (j < len) {
5907 SPLIT_APPEND(data, j, len);
5908 }
5909
5910 Py_DECREF(string);
5911 return list;
5912
5913 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005914 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 Py_DECREF(string);
5916 return NULL;
5917}
5918
Tim Petersced69f82003-09-16 20:30:58 +00005919static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920PyObject *split_char(PyUnicodeObject *self,
5921 PyObject *list,
5922 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 register Py_ssize_t i;
5926 register Py_ssize_t j;
5927 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005929 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
5931 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005932 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 if (maxcount-- <= 0)
5934 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005935 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 i = j = i + 1;
5937 } else
5938 i++;
5939 }
5940 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005941 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
5943 return list;
5944
5945 onError:
5946 Py_DECREF(list);
5947 return NULL;
5948}
5949
Tim Petersced69f82003-09-16 20:30:58 +00005950static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951PyObject *split_substring(PyUnicodeObject *self,
5952 PyObject *list,
5953 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 register Py_ssize_t i;
5957 register Py_ssize_t j;
5958 Py_ssize_t len = self->length;
5959 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 PyObject *str;
5961
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005962 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 if (Py_UNICODE_MATCH(self, i, substring)) {
5964 if (maxcount-- <= 0)
5965 break;
5966 SPLIT_APPEND(self->str, j, i);
5967 i = j = i + sublen;
5968 } else
5969 i++;
5970 }
5971 if (j <= len) {
5972 SPLIT_APPEND(self->str, j, len);
5973 }
5974 return list;
5975
5976 onError:
5977 Py_DECREF(list);
5978 return NULL;
5979}
5980
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005981static
5982PyObject *rsplit_whitespace(PyUnicodeObject *self,
5983 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005984 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005986 register Py_ssize_t i;
5987 register Py_ssize_t j;
5988 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005989 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005990 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991
5992 for (i = j = len - 1; i >= 0; ) {
5993 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005994 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005995 i--;
5996 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005997 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005998 i--;
5999 if (j > i) {
6000 if (maxcount-- <= 0)
6001 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006002 SPLIT_APPEND(buf, i + 1, j + 1);
6003 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004 i--;
6005 j = i;
6006 }
6007 }
6008 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006009 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 if (PyList_Reverse(list) < 0)
6012 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006013 return list;
6014
6015 onError:
6016 Py_DECREF(list);
6017 return NULL;
6018}
6019
6020static
6021PyObject *rsplit_char(PyUnicodeObject *self,
6022 PyObject *list,
6023 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006025{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 register Py_ssize_t i;
6027 register Py_ssize_t j;
6028 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006029 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006030 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006031
6032 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006033 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006034 if (maxcount-- <= 0)
6035 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006036 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006037 j = i = i - 1;
6038 } else
6039 i--;
6040 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006041 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006042 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006043 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006044 if (PyList_Reverse(list) < 0)
6045 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006046 return list;
6047
6048 onError:
6049 Py_DECREF(list);
6050 return NULL;
6051}
6052
6053static
6054PyObject *rsplit_substring(PyUnicodeObject *self,
6055 PyObject *list,
6056 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 register Py_ssize_t i;
6060 register Py_ssize_t j;
6061 Py_ssize_t len = self->length;
6062 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006063 PyObject *str;
6064
6065 for (i = len - sublen, j = len; i >= 0; ) {
6066 if (Py_UNICODE_MATCH(self, i, substring)) {
6067 if (maxcount-- <= 0)
6068 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006069 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006070 j = i;
6071 i -= sublen;
6072 } else
6073 i--;
6074 }
6075 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006077 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006078 if (PyList_Reverse(list) < 0)
6079 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006080 return list;
6081
6082 onError:
6083 Py_DECREF(list);
6084 return NULL;
6085}
6086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087#undef SPLIT_APPEND
6088
6089static
6090PyObject *split(PyUnicodeObject *self,
6091 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
6094 PyObject *list;
6095
6096 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006097 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099 list = PyList_New(0);
6100 if (!list)
6101 return NULL;
6102
6103 if (substring == NULL)
6104 return split_whitespace(self,list,maxcount);
6105
6106 else if (substring->length == 1)
6107 return split_char(self,list,substring->str[0],maxcount);
6108
6109 else if (substring->length == 0) {
6110 Py_DECREF(list);
6111 PyErr_SetString(PyExc_ValueError, "empty separator");
6112 return NULL;
6113 }
6114 else
6115 return split_substring(self,list,substring,maxcount);
6116}
6117
Tim Petersced69f82003-09-16 20:30:58 +00006118static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006119PyObject *rsplit(PyUnicodeObject *self,
6120 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006122{
6123 PyObject *list;
6124
6125 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006126 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006127
6128 list = PyList_New(0);
6129 if (!list)
6130 return NULL;
6131
6132 if (substring == NULL)
6133 return rsplit_whitespace(self,list,maxcount);
6134
6135 else if (substring->length == 1)
6136 return rsplit_char(self,list,substring->str[0],maxcount);
6137
6138 else if (substring->length == 0) {
6139 Py_DECREF(list);
6140 PyErr_SetString(PyExc_ValueError, "empty separator");
6141 return NULL;
6142 }
6143 else
6144 return rsplit_substring(self,list,substring,maxcount);
6145}
6146
6147static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148PyObject *replace(PyUnicodeObject *self,
6149 PyUnicodeObject *str1,
6150 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
6153 PyUnicodeObject *u;
6154
6155 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006156 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Thomas Wouters477c8d52006-05-27 19:21:47 +00006158 if (str1->length == str2->length) {
6159 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006160 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006161 if (str1->length == 1) {
6162 /* replace characters */
6163 Py_UNICODE u1, u2;
6164 if (!findchar(self->str, self->length, str1->str[0]))
6165 goto nothing;
6166 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6167 if (!u)
6168 return NULL;
6169 Py_UNICODE_COPY(u->str, self->str, self->length);
6170 u1 = str1->str[0];
6171 u2 = str2->str[0];
6172 for (i = 0; i < u->length; i++)
6173 if (u->str[i] == u1) {
6174 if (--maxcount < 0)
6175 break;
6176 u->str[i] = u2;
6177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179 i = fastsearch(
6180 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006182 if (i < 0)
6183 goto nothing;
6184 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6185 if (!u)
6186 return NULL;
6187 Py_UNICODE_COPY(u->str, self->str, self->length);
6188 while (i <= self->length - str1->length)
6189 if (Py_UNICODE_MATCH(self, i, str1)) {
6190 if (--maxcount < 0)
6191 break;
6192 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6193 i += str1->length;
6194 } else
6195 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006198
6199 Py_ssize_t n, i, j, e;
6200 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 Py_UNICODE *p;
6202
6203 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006204 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 if (n > maxcount)
6206 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006207 if (n == 0)
6208 goto nothing;
6209 /* new_size = self->length + n * (str2->length - str1->length)); */
6210 delta = (str2->length - str1->length);
6211 if (delta == 0) {
6212 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006214 product = n * (str2->length - str1->length);
6215 if ((product / (str2->length - str1->length)) != n) {
6216 PyErr_SetString(PyExc_OverflowError,
6217 "replace string is too long");
6218 return NULL;
6219 }
6220 new_size = self->length + product;
6221 if (new_size < 0) {
6222 PyErr_SetString(PyExc_OverflowError,
6223 "replace string is too long");
6224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
6226 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006227 u = _PyUnicode_New(new_size);
6228 if (!u)
6229 return NULL;
6230 i = 0;
6231 p = u->str;
6232 e = self->length - str1->length;
6233 if (str1->length > 0) {
6234 while (n-- > 0) {
6235 /* look for next match */
6236 j = i;
6237 while (j <= e) {
6238 if (Py_UNICODE_MATCH(self, j, str1))
6239 break;
6240 j++;
6241 }
6242 if (j > i) {
6243 if (j > e)
6244 break;
6245 /* copy unchanged part [i:j] */
6246 Py_UNICODE_COPY(p, self->str+i, j-i);
6247 p += j - i;
6248 }
6249 /* copy substitution string */
6250 if (str2->length > 0) {
6251 Py_UNICODE_COPY(p, str2->str, str2->length);
6252 p += str2->length;
6253 }
6254 i = j + str1->length;
6255 }
6256 if (i < self->length)
6257 /* copy tail [i:] */
6258 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6259 } else {
6260 /* interleave */
6261 while (n > 0) {
6262 Py_UNICODE_COPY(p, str2->str, str2->length);
6263 p += str2->length;
6264 if (--n <= 0)
6265 break;
6266 *p++ = self->str[i++];
6267 }
6268 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006272
6273nothing:
6274 /* nothing to replace; return original string (when possible) */
6275 if (PyUnicode_CheckExact(self)) {
6276 Py_INCREF(self);
6277 return (PyObject *) self;
6278 }
6279 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Unicode Object Methods --------------------------------------------- */
6283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006285"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286\n\
6287Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
6290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006291unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 return fixup(self, fixtitle);
6294}
6295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006297"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298\n\
6299Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006300have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006303unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 return fixup(self, fixcapitalize);
6306}
6307
#if 0
/* Disabled code: kept for reference only, never compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split on whitespace, capitalize every word in place in the list,
   then join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto done;
        /* drop the old word, then store the capitalized one in its slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

done:
    /* item is NULL here when capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006346/* Argument converter. Coerces to a single unicode character */
6347
6348static int
6349convert_uc(PyObject *obj, void *addr)
6350{
6351 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6352 PyObject *uniobj;
6353 Py_UNICODE *unistr;
6354
6355 uniobj = PyUnicode_FromObject(obj);
6356 if (uniobj == NULL) {
6357 PyErr_SetString(PyExc_TypeError,
6358 "The fill character cannot be converted to Unicode");
6359 return 0;
6360 }
6361 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6362 PyErr_SetString(PyExc_TypeError,
6363 "The fill character must be exactly one character long");
6364 Py_DECREF(uniobj);
6365 return 0;
6366 }
6367 unistr = PyUnicode_AS_UNICODE(uniobj);
6368 *fillcharloc = unistr[0];
6369 Py_DECREF(uniobj);
6370 return 1;
6371}
6372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006373PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006374"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006376Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006377done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379static PyObject *
6380unicode_center(PyUnicodeObject *self, PyObject *args)
6381{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 Py_ssize_t marg, left;
6383 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006384 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385
Thomas Woutersde017742006-02-16 19:34:37 +00006386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return NULL;
6388
Tim Peters7a29bd52001-09-12 03:03:31 +00006389 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 Py_INCREF(self);
6391 return (PyObject*) self;
6392 }
6393
6394 marg = width - self->length;
6395 left = marg / 2 + (marg & width & 1);
6396
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006397 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398}
6399
Marc-André Lemburge5034372000-08-08 08:04:29 +00006400#if 0
6401
6402/* This code should go into some future Unicode collation support
6403 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006404 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006405
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006406/* speedy UTF-16 code point order comparison */
6407/* gleaned from: */
6408/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6409
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006410static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006411{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006412 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006413 0, 0, 0, 0, 0, 0, 0, 0,
6414 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006415 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006416};
6417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418static int
6419unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006421 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 Py_UNICODE *s1 = str1->str;
6424 Py_UNICODE *s2 = str2->str;
6425
6426 len1 = str1->length;
6427 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006430 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006431
6432 c1 = *s1++;
6433 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006434
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006435 if (c1 > (1<<11) * 26)
6436 c1 += utf16Fixup[c1>>11];
6437 if (c2 > (1<<11) * 26)
6438 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006439 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006440
6441 if (c1 != c2)
6442 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006443
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006444 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
6446
6447 return (len1 < len2) ? -1 : (len1 != len2);
6448}
6449
Marc-André Lemburge5034372000-08-08 08:04:29 +00006450#else
6451
6452static int
6453unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006456
6457 Py_UNICODE *s1 = str1->str;
6458 Py_UNICODE *s2 = str2->str;
6459
6460 len1 = str1->length;
6461 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006462
Marc-André Lemburge5034372000-08-08 08:04:29 +00006463 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006464 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006465
Fredrik Lundh45714e92001-06-26 16:39:36 +00006466 c1 = *s1++;
6467 c2 = *s2++;
6468
6469 if (c1 != c2)
6470 return (c1 < c2) ? -1 : 1;
6471
Marc-André Lemburge5034372000-08-08 08:04:29 +00006472 len1--; len2--;
6473 }
6474
6475 return (len1 < len2) ? -1 : (len1 != len2);
6476}
6477
6478#endif
6479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480int PyUnicode_Compare(PyObject *left,
6481 PyObject *right)
6482{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006483 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6484 return unicode_compare((PyUnicodeObject *)left,
6485 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006486 PyErr_Format(PyExc_TypeError,
6487 "Can't compare %.100s and %.100s",
6488 left->ob_type->tp_name,
6489 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 return -1;
6491}
6492
Martin v. Löwis5b222132007-06-10 09:51:05 +00006493int
6494PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6495{
6496 int i;
6497 Py_UNICODE *id;
6498 assert(PyUnicode_Check(uni));
6499 id = PyUnicode_AS_UNICODE(uni);
6500 /* Compare Unicode string and source character set string */
6501 for (i = 0; id[i] && str[i]; i++)
6502 if (id[i] != str[i])
6503 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6504 if (id[i])
6505 return 1; /* uni is longer */
6506 if (str[i])
6507 return -1; /* str is longer */
6508 return 0;
6509}
6510
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006511PyObject *PyUnicode_RichCompare(PyObject *left,
6512 PyObject *right,
6513 int op)
6514{
6515 int result;
6516
6517 result = PyUnicode_Compare(left, right);
6518 if (result == -1 && PyErr_Occurred())
6519 goto onError;
6520
6521 /* Convert the return value to a Boolean */
6522 switch (op) {
6523 case Py_EQ:
6524 result = (result == 0);
6525 break;
6526 case Py_NE:
6527 result = (result != 0);
6528 break;
6529 case Py_LE:
6530 result = (result <= 0);
6531 break;
6532 case Py_GE:
6533 result = (result >= 0);
6534 break;
6535 case Py_LT:
6536 result = (result == -1);
6537 break;
6538 case Py_GT:
6539 result = (result == 1);
6540 break;
6541 }
6542 return PyBool_FromLong(result);
6543
6544 onError:
6545
6546 /* Standard case
6547
6548 Type errors mean that PyUnicode_FromObject() could not convert
6549 one of the arguments (usually the right hand side) to Unicode,
6550 ie. we can't handle the comparison request. However, it is
6551 possible that the other object knows a comparison method, which
6552 is why we return Py_NotImplemented to give the other object a
6553 chance.
6554
6555 */
6556 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6557 PyErr_Clear();
6558 Py_INCREF(Py_NotImplemented);
6559 return Py_NotImplemented;
6560 }
6561 if (op != Py_EQ && op != Py_NE)
6562 return NULL;
6563
6564 /* Equality comparison.
6565
6566 This is a special case: we silence any PyExc_UnicodeDecodeError
6567 and instead turn it into a PyErr_UnicodeWarning.
6568
6569 */
6570 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6571 return NULL;
6572 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006573 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6574 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006575 "equal comparison "
6576 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006577 "interpreting them as being unequal"
6578 :
6579 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006580 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006581 "interpreting them as being unequal",
6582 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006583 return NULL;
6584 result = (op == Py_NE);
6585 return PyBool_FromLong(result);
6586}
6587
Guido van Rossum403d68b2000-03-13 15:55:09 +00006588int PyUnicode_Contains(PyObject *container,
6589 PyObject *element)
6590{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006593
6594 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 sub = PyUnicode_FromObject(element);
6596 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006597 PyErr_Format(PyExc_TypeError,
6598 "'in <string>' requires string as left operand, not %s",
6599 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006600 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006601 }
6602
Thomas Wouters477c8d52006-05-27 19:21:47 +00006603 str = PyUnicode_FromObject(container);
6604 if (!str) {
6605 Py_DECREF(sub);
6606 return -1;
6607 }
6608
6609 result = stringlib_contains_obj(str, sub);
6610
6611 Py_DECREF(str);
6612 Py_DECREF(sub);
6613
Guido van Rossum403d68b2000-03-13 15:55:09 +00006614 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006615}
6616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617/* Concat to string or Unicode object giving a new Unicode object. */
6618
6619PyObject *PyUnicode_Concat(PyObject *left,
6620 PyObject *right)
6621{
6622 PyUnicodeObject *u = NULL, *v = NULL, *w;
6623
6624 /* Coerce the two arguments */
6625 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6626 if (u == NULL)
6627 goto onError;
6628 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6629 if (v == NULL)
6630 goto onError;
6631
6632 /* Shortcuts */
6633 if (v == unicode_empty) {
6634 Py_DECREF(v);
6635 return (PyObject *)u;
6636 }
6637 if (u == unicode_empty) {
6638 Py_DECREF(u);
6639 return (PyObject *)v;
6640 }
6641
6642 /* Concat the two Unicode strings */
6643 w = _PyUnicode_New(u->length + v->length);
6644 if (w == NULL)
6645 goto onError;
6646 Py_UNICODE_COPY(w->str, u->str, u->length);
6647 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6648
6649 Py_DECREF(u);
6650 Py_DECREF(v);
6651 return (PyObject *)w;
6652
6653onError:
6654 Py_XDECREF(u);
6655 Py_XDECREF(v);
6656 return NULL;
6657}
6658
Walter Dörwald1ab83302007-05-18 17:15:44 +00006659void
6660PyUnicode_Append(PyObject **pleft, PyObject *right)
6661{
6662 PyObject *new;
6663 if (*pleft == NULL)
6664 return;
6665 if (right == NULL || !PyUnicode_Check(*pleft)) {
6666 Py_DECREF(*pleft);
6667 *pleft = NULL;
6668 return;
6669 }
6670 new = PyUnicode_Concat(*pleft, right);
6671 Py_DECREF(*pleft);
6672 *pleft = new;
6673}
6674
6675void
6676PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6677{
6678 PyUnicode_Append(pleft, right);
6679 Py_XDECREF(right);
6680}
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683"S.count(sub[, start[, end]]) -> int\n\
6684\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006686string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006687interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject *
6690unicode_count(PyUnicodeObject *self, PyObject *args)
6691{
6692 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006694 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 PyObject *result;
6696
Guido van Rossumb8872e62000-05-09 14:14:27 +00006697 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 return NULL;
6700
6701 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006702 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (substring == NULL)
6704 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006705
Thomas Wouters477c8d52006-05-27 19:21:47 +00006706 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707
Christian Heimes217cfd12007-12-02 14:31:20 +00006708 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006709 stringlib_count(self->str + start, end - start,
6710 substring->str, substring->length)
6711 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
6713 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006714
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 return result;
6716}
6717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006718PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006719"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006721Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006722to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006723handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6725'xmlcharrefreplace' as well as any other name registered with\n\
6726codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
6728static PyObject *
6729unicode_encode(PyUnicodeObject *self, PyObject *args)
6730{
6731 char *encoding = NULL;
6732 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006733 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6736 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006737 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006738 if (v == NULL)
6739 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006740 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006741 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006742 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006743 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006744 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006745 Py_DECREF(v);
6746 return NULL;
6747 }
6748 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006749
6750 onError:
6751 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006752}
6753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006755"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756\n\
6757Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006758If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
6760static PyObject*
6761unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6762{
6763 Py_UNICODE *e;
6764 Py_UNICODE *p;
6765 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006766 Py_UNICODE *qe;
6767 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 PyUnicodeObject *u;
6769 int tabsize = 8;
6770
6771 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6772 return NULL;
6773
Thomas Wouters7e474022000-07-16 12:04:32 +00006774 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006775 i = 0; /* chars up to and including most recent \n or \r */
6776 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6777 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 for (p = self->str; p < e; p++)
6779 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006780 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006781 incr = tabsize - (j % tabsize); /* cannot overflow */
6782 if (j > PY_SSIZE_T_MAX - incr)
6783 goto overflow1;
6784 j += incr;
6785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
6787 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006788 if (j > PY_SSIZE_T_MAX - 1)
6789 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 j++;
6791 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006792 if (i > PY_SSIZE_T_MAX - j)
6793 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006795 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 }
6797 }
6798
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006799 if (i > PY_SSIZE_T_MAX - j)
6800 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006801
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 /* Second pass: create output string and fill it */
6803 u = _PyUnicode_New(i + j);
6804 if (!u)
6805 return NULL;
6806
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006807 j = 0; /* same as in first pass */
6808 q = u->str; /* next output char */
6809 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811 for (p = self->str; p < e; p++)
6812 if (*p == '\t') {
6813 if (tabsize > 0) {
6814 i = tabsize - (j % tabsize);
6815 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006816 while (i--) {
6817 if (q >= qe)
6818 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
6822 }
6823 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006824 if (q >= qe)
6825 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006827 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 if (*p == '\n' || *p == '\r')
6829 j = 0;
6830 }
6831
6832 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006833
6834 overflow2:
6835 Py_DECREF(u);
6836 overflow1:
6837 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839}
6840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006842"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843\n\
6844Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006845such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846arguments start and end are interpreted as in slice notation.\n\
6847\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
6850static PyObject *
6851unicode_find(PyUnicodeObject *self, PyObject *args)
6852{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006853 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006854 Py_ssize_t start;
6855 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006856 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857
Christian Heimes9cd17752007-11-18 19:35:23 +00006858 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860
Thomas Wouters477c8d52006-05-27 19:21:47 +00006861 result = stringlib_find_slice(
6862 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6863 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6864 start, end
6865 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866
6867 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006868
Christian Heimes217cfd12007-12-02 14:31:20 +00006869 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870}
6871
6872static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006873unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
6875 if (index < 0 || index >= self->length) {
6876 PyErr_SetString(PyExc_IndexError, "string index out of range");
6877 return NULL;
6878 }
6879
6880 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6881}
6882
Guido van Rossumc2504932007-09-18 19:42:40 +00006883/* Believe it or not, this produces the same value for ASCII strings
6884 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006886unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Guido van Rossumc2504932007-09-18 19:42:40 +00006888 Py_ssize_t len;
6889 Py_UNICODE *p;
6890 long x;
6891
6892 if (self->hash != -1)
6893 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006894 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006895 p = self->str;
6896 x = *p << 7;
6897 while (--len >= 0)
6898 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006899 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006900 if (x == -1)
6901 x = -2;
6902 self->hash = x;
6903 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006907"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
6911static PyObject *
6912unicode_index(PyUnicodeObject *self, PyObject *args)
6913{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006914 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006915 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006916 Py_ssize_t start;
6917 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Christian Heimes9cd17752007-11-18 19:35:23 +00006919 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
Thomas Wouters477c8d52006-05-27 19:21:47 +00006922 result = stringlib_find_slice(
6923 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6924 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6925 start, end
6926 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 if (result < 0) {
6931 PyErr_SetString(PyExc_ValueError, "substring not found");
6932 return NULL;
6933 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006934
Christian Heimes217cfd12007-12-02 14:31:20 +00006935 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949 int cased;
6950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 /* Shortcut for single character strings */
6952 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 e = p + PyUnicode_GET_SIZE(self);
6960 cased = 0;
6961 for (; p < e; p++) {
6962 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006963
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 else if (!cased && Py_UNICODE_ISLOWER(ch))
6967 cased = 1;
6968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970}
6971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006975Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006976at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977
6978static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006979unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
6981 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6982 register const Py_UNICODE *e;
6983 int cased;
6984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 /* Shortcut for single character strings */
6986 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006989 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006990 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006992
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 e = p + PyUnicode_GET_SIZE(self);
6994 cased = 0;
6995 for (; p < e; p++) {
6996 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006997
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006999 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 else if (!cased && Py_UNICODE_ISUPPER(ch))
7001 cased = 1;
7002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004}
7005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007009Return True if S is a titlecased string and there is at least one\n\
7010character in S, i.e. upper- and titlecase characters may only\n\
7011follow uncased characters and lowercase characters only cased ones.\n\
7012Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
7014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016{
7017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7018 register const Py_UNICODE *e;
7019 int cased, previous_is_cased;
7020
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 /* Shortcut for single character strings */
7022 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007023 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7024 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007026 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007027 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007029
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 e = p + PyUnicode_GET_SIZE(self);
7031 cased = 0;
7032 previous_is_cased = 0;
7033 for (; p < e; p++) {
7034 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007035
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7037 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 previous_is_cased = 1;
7040 cased = 1;
7041 }
7042 else if (Py_UNICODE_ISLOWER(ch)) {
7043 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007044 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 previous_is_cased = 1;
7046 cased = 1;
7047 }
7048 else
7049 previous_is_cased = 0;
7050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007051 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007055"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007057Return True if all characters in S are whitespace\n\
7058and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007061unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062{
7063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7064 register const Py_UNICODE *e;
7065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 /* Shortcut for single character strings */
7067 if (PyUnicode_GET_SIZE(self) == 1 &&
7068 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007069 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007071 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007072 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007074
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 e = p + PyUnicode_GET_SIZE(self);
7076 for (; p < e; p++) {
7077 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007078 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081}
7082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007084"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007085\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007086Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007088
7089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007090unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007091{
7092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7093 register const Py_UNICODE *e;
7094
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007095 /* Shortcut for single character strings */
7096 if (PyUnicode_GET_SIZE(self) == 1 &&
7097 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007098 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007099
7100 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007101 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007102 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007103
7104 e = p + PyUnicode_GET_SIZE(self);
7105 for (; p < e; p++) {
7106 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007107 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007109 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007113"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007114\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007115Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007117
7118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007120{
7121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7122 register const Py_UNICODE *e;
7123
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007124 /* Shortcut for single character strings */
7125 if (PyUnicode_GET_SIZE(self) == 1 &&
7126 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007127 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007128
7129 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007130 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007131 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007132
7133 e = p + PyUnicode_GET_SIZE(self);
7134 for (; p < e; p++) {
7135 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007136 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007138 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007142"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007144Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007148unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149{
7150 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7151 register const Py_UNICODE *e;
7152
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 /* Shortcut for single character strings */
7154 if (PyUnicode_GET_SIZE(self) == 1 &&
7155 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007156 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007158 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007159 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007161
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 e = p + PyUnicode_GET_SIZE(self);
7163 for (; p < e; p++) {
7164 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007165 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007167 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168}
7169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007171"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007173Return True if all characters in S are digits\n\
7174and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175
7176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007177unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178{
7179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7180 register const Py_UNICODE *e;
7181
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 /* Shortcut for single character strings */
7183 if (PyUnicode_GET_SIZE(self) == 1 &&
7184 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007185 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007187 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007188 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007189 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007190
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 e = p + PyUnicode_GET_SIZE(self);
7192 for (; p < e; p++) {
7193 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007194 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007196 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197}
7198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007199PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007200"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007202Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007203False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204
7205static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007206unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207{
7208 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7209 register const Py_UNICODE *e;
7210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 /* Shortcut for single character strings */
7212 if (PyUnicode_GET_SIZE(self) == 1 &&
7213 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007214 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007216 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007217 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007218 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007219
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 e = p + PyUnicode_GET_SIZE(self);
7221 for (; p < e; p++) {
7222 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007223 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007225 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Martin v. Löwis47383402007-08-15 07:32:56 +00007228int
7229PyUnicode_IsIdentifier(PyObject *self)
7230{
7231 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7232 register const Py_UNICODE *e;
7233
7234 /* Special case for empty strings */
7235 if (PyUnicode_GET_SIZE(self) == 0)
7236 return 0;
7237
7238 /* PEP 3131 says that the first character must be in
7239 XID_Start and subsequent characters in XID_Continue,
7240 and for the ASCII range, the 2.x rules apply (i.e
7241 start with letters and underscore, continue with
7242 letters, digits, underscore). However, given the current
7243 definition of XID_Start and XID_Continue, it is sufficient
7244 to check just for these, except that _ must be allowed
7245 as starting an identifier. */
7246 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7247 return 0;
7248
7249 e = p + PyUnicode_GET_SIZE(self);
7250 for (p++; p < e; p++) {
7251 if (!_PyUnicode_IsXidContinue(*p))
7252 return 0;
7253 }
7254 return 1;
7255}
7256
7257PyDoc_STRVAR(isidentifier__doc__,
7258"S.isidentifier() -> bool\n\
7259\n\
7260Return True if S is a valid identifier according\n\
7261to the language definition.");
7262
7263static PyObject*
7264unicode_isidentifier(PyObject *self)
7265{
7266 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7267}
7268
Georg Brandl559e5d72008-06-11 18:37:52 +00007269PyDoc_STRVAR(isprintable__doc__,
7270"S.isprintable() -> bool\n\
7271\n\
7272Return True if all characters in S are considered\n\
7273printable in repr() or S is empty, False otherwise.");
7274
7275static PyObject*
7276unicode_isprintable(PyObject *self)
7277{
7278 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7279 register const Py_UNICODE *e;
7280
7281 /* Shortcut for single character strings */
7282 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7283 Py_RETURN_TRUE;
7284 }
7285
7286 e = p + PyUnicode_GET_SIZE(self);
7287 for (; p < e; p++) {
7288 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7289 Py_RETURN_FALSE;
7290 }
7291 }
7292 Py_RETURN_TRUE;
7293}
7294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007295PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007296"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297\n\
7298Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007299sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300
7301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007302unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007304 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305}
7306
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308unicode_length(PyUnicodeObject *self)
7309{
7310 return self->length;
7311}
7312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007313PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007314"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007316Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007317done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
7319static PyObject *
7320unicode_ljust(PyUnicodeObject *self, PyObject *args)
7321{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007322 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007323 Py_UNICODE fillchar = ' ';
7324
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007325 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 return NULL;
7327
Tim Peters7a29bd52001-09-12 03:03:31 +00007328 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 Py_INCREF(self);
7330 return (PyObject*) self;
7331 }
7332
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007333 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334}
7335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007337"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007339Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007342unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 return fixup(self, fixlower);
7345}
7346
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: its format string with the three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7355
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007356/* externally visible for str.strip(unicode) */
7357PyObject *
7358_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7359{
7360 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007361 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007362 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7364 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007365
Thomas Wouters477c8d52006-05-27 19:21:47 +00007366 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7367
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007368 i = 0;
7369 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007370 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7371 i++;
7372 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007373 }
7374
7375 j = len;
7376 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007377 do {
7378 j--;
7379 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7380 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007381 }
7382
7383 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007384 Py_INCREF(self);
7385 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007386 }
7387 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007388 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007389}
7390
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
7392static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007393do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007395 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007396 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007397
7398 i = 0;
7399 if (striptype != RIGHTSTRIP) {
7400 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7401 i++;
7402 }
7403 }
7404
7405 j = len;
7406 if (striptype != LEFTSTRIP) {
7407 do {
7408 j--;
7409 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7410 j++;
7411 }
7412
7413 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7414 Py_INCREF(self);
7415 return (PyObject*)self;
7416 }
7417 else
7418 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419}
7420
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007421
7422static PyObject *
7423do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7424{
7425 PyObject *sep = NULL;
7426
7427 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7428 return NULL;
7429
7430 if (sep != NULL && sep != Py_None) {
7431 if (PyUnicode_Check(sep))
7432 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007433 else {
7434 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007435 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007436 STRIPNAME(striptype));
7437 return NULL;
7438 }
7439 }
7440
7441 return do_strip(self, striptype);
7442}
7443
7444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007446"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007447\n\
7448Return a copy of the string S with leading and trailing\n\
7449whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007450If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007451
7452static PyObject *
7453unicode_strip(PyUnicodeObject *self, PyObject *args)
7454{
7455 if (PyTuple_GET_SIZE(args) == 0)
7456 return do_strip(self, BOTHSTRIP); /* Common case */
7457 else
7458 return do_argstrip(self, BOTHSTRIP, args);
7459}
7460
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007463"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007464\n\
7465Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007466If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007467
7468static PyObject *
7469unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7470{
7471 if (PyTuple_GET_SIZE(args) == 0)
7472 return do_strip(self, LEFTSTRIP); /* Common case */
7473 else
7474 return do_argstrip(self, LEFTSTRIP, args);
7475}
7476
7477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007479"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007480\n\
7481Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007482If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007483
7484static PyObject *
7485unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7486{
7487 if (PyTuple_GET_SIZE(args) == 0)
7488 return do_strip(self, RIGHTSTRIP); /* Common case */
7489 else
7490 return do_argstrip(self, RIGHTSTRIP, args);
7491}
7492
7493
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007495unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496{
7497 PyUnicodeObject *u;
7498 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007500 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501
7502 if (len < 0)
7503 len = 0;
7504
Tim Peters7a29bd52001-09-12 03:03:31 +00007505 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 /* no repeat, return original string */
7507 Py_INCREF(str);
7508 return (PyObject*) str;
7509 }
Tim Peters8f422462000-09-09 06:13:41 +00007510
7511 /* ensure # of chars needed doesn't overflow int and # of bytes
7512 * needed doesn't overflow size_t
7513 */
7514 nchars = len * str->length;
7515 if (len && nchars / len != str->length) {
7516 PyErr_SetString(PyExc_OverflowError,
7517 "repeated string is too long");
7518 return NULL;
7519 }
7520 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7521 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7522 PyErr_SetString(PyExc_OverflowError,
7523 "repeated string is too long");
7524 return NULL;
7525 }
7526 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 if (!u)
7528 return NULL;
7529
7530 p = u->str;
7531
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 if (str->length == 1 && len > 0) {
7533 Py_UNICODE_FILL(p, str->str[0], len);
7534 } else {
7535 Py_ssize_t done = 0; /* number of characters copied this far */
7536 if (done < nchars) {
7537 Py_UNICODE_COPY(p, str->str, str->length);
7538 done = str->length;
7539 }
7540 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007541 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007542 Py_UNICODE_COPY(p+done, p, n);
7543 done += n;
7544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 }
7546
7547 return (PyObject*) u;
7548}
7549
7550PyObject *PyUnicode_Replace(PyObject *obj,
7551 PyObject *subobj,
7552 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554{
7555 PyObject *self;
7556 PyObject *str1;
7557 PyObject *str2;
7558 PyObject *result;
7559
7560 self = PyUnicode_FromObject(obj);
7561 if (self == NULL)
7562 return NULL;
7563 str1 = PyUnicode_FromObject(subobj);
7564 if (str1 == NULL) {
7565 Py_DECREF(self);
7566 return NULL;
7567 }
7568 str2 = PyUnicode_FromObject(replobj);
7569 if (str2 == NULL) {
7570 Py_DECREF(self);
7571 Py_DECREF(str1);
7572 return NULL;
7573 }
Tim Petersced69f82003-09-16 20:30:58 +00007574 result = replace((PyUnicodeObject *)self,
7575 (PyUnicodeObject *)str1,
7576 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 maxcount);
7578 Py_DECREF(self);
7579 Py_DECREF(str1);
7580 Py_DECREF(str2);
7581 return result;
7582}
7583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007584PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007585"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586\n\
7587Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007588old replaced by new. If the optional argument count is\n\
7589given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591static PyObject*
7592unicode_replace(PyUnicodeObject *self, PyObject *args)
7593{
7594 PyUnicodeObject *str1;
7595 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007596 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 PyObject *result;
7598
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 return NULL;
7601 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7602 if (str1 == NULL)
7603 return NULL;
7604 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007605 if (str2 == NULL) {
7606 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
7610 result = replace(self, str1, str2, maxcount);
7611
7612 Py_DECREF(str1);
7613 Py_DECREF(str2);
7614 return result;
7615}
7616
7617static
7618PyObject *unicode_repr(PyObject *unicode)
7619{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007620 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007621 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007622 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7623 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7624
7625 /* XXX(nnorwitz): rather than over-allocating, it would be
7626 better to choose a different scheme. Perhaps scan the
7627 first N-chars of the string and allocate based on that size.
7628 */
7629 /* Initial allocation is based on the longest-possible unichr
7630 escape.
7631
7632 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7633 unichr, so in this case it's the longest unichr escape. In
7634 narrow (UTF-16) builds this is five chars per source unichr
7635 since there are two unichrs in the surrogate pair, so in narrow
7636 (UTF-16) builds it's not the longest unichr escape.
7637
7638 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7639 so in the narrow (UTF-16) build case it's the longest unichr
7640 escape.
7641 */
7642
Walter Dörwald1ab83302007-05-18 17:15:44 +00007643 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007644 2 /* quotes */
7645#ifdef Py_UNICODE_WIDE
7646 + 10*size
7647#else
7648 + 6*size
7649#endif
7650 + 1);
7651 if (repr == NULL)
7652 return NULL;
7653
Walter Dörwald1ab83302007-05-18 17:15:44 +00007654 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007655
7656 /* Add quote */
7657 *p++ = (findchar(s, size, '\'') &&
7658 !findchar(s, size, '"')) ? '"' : '\'';
7659 while (size-- > 0) {
7660 Py_UNICODE ch = *s++;
7661
7662 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007663 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007664 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007665 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007666 continue;
7667 }
7668
Georg Brandl559e5d72008-06-11 18:37:52 +00007669 /* Map special whitespace to '\t', \n', '\r' */
7670 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007671 *p++ = '\\';
7672 *p++ = 't';
7673 }
7674 else if (ch == '\n') {
7675 *p++ = '\\';
7676 *p++ = 'n';
7677 }
7678 else if (ch == '\r') {
7679 *p++ = '\\';
7680 *p++ = 'r';
7681 }
7682
7683 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007684 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007685 *p++ = '\\';
7686 *p++ = 'x';
7687 *p++ = hexdigits[(ch >> 4) & 0x000F];
7688 *p++ = hexdigits[ch & 0x000F];
7689 }
7690
Georg Brandl559e5d72008-06-11 18:37:52 +00007691 /* Copy ASCII characters as-is */
7692 else if (ch < 0x7F) {
7693 *p++ = ch;
7694 }
7695
7696 /* Non-ASCII characters */
7697 else {
7698 Py_UCS4 ucs = ch;
7699
7700#ifndef Py_UNICODE_WIDE
7701 Py_UNICODE ch2 = 0;
7702 /* Get code point from surrogate pair */
7703 if (size > 0) {
7704 ch2 = *s;
7705 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7706 && ch2 <= 0xDFFF) {
7707 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7708 + 0x00010000;
7709 s++;
7710 size--;
7711 }
7712 }
7713#endif
7714 /* Map Unicode whitespace and control characters
7715 (categories Z* and C* except ASCII space)
7716 */
7717 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7718 /* Map 8-bit characters to '\xhh' */
7719 if (ucs <= 0xff) {
7720 *p++ = '\\';
7721 *p++ = 'x';
7722 *p++ = hexdigits[(ch >> 4) & 0x000F];
7723 *p++ = hexdigits[ch & 0x000F];
7724 }
7725 /* Map 21-bit characters to '\U00xxxxxx' */
7726 else if (ucs >= 0x10000) {
7727 *p++ = '\\';
7728 *p++ = 'U';
7729 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7730 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7731 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7732 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7733 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7734 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7735 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7736 *p++ = hexdigits[ucs & 0x0000000F];
7737 }
7738 /* Map 16-bit characters to '\uxxxx' */
7739 else {
7740 *p++ = '\\';
7741 *p++ = 'u';
7742 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7743 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7744 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7745 *p++ = hexdigits[ucs & 0x000F];
7746 }
7747 }
7748 /* Copy characters as-is */
7749 else {
7750 *p++ = ch;
7751#ifndef Py_UNICODE_WIDE
7752 if (ucs >= 0x10000)
7753 *p++ = ch2;
7754#endif
7755 }
7756 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007757 }
7758 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007759 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007760
7761 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007762 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007763 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764}
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007767"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768\n\
7769Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007770such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771arguments start and end are interpreted as in slice notation.\n\
7772\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007773Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774
7775static PyObject *
7776unicode_rfind(PyUnicodeObject *self, PyObject *args)
7777{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007778 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007779 Py_ssize_t start;
7780 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007781 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
Christian Heimes9cd17752007-11-18 19:35:23 +00007783 if (!_ParseTupleFinds(args, &substring, &start, &end))
7784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
Thomas Wouters477c8d52006-05-27 19:21:47 +00007786 result = stringlib_rfind_slice(
7787 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7788 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7789 start, end
7790 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
7792 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007793
Christian Heimes217cfd12007-12-02 14:31:20 +00007794 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795}
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007798"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007800Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
7802static PyObject *
7803unicode_rindex(PyUnicodeObject *self, PyObject *args)
7804{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007805 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007806 Py_ssize_t start;
7807 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007808 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
Christian Heimes9cd17752007-11-18 19:35:23 +00007810 if (!_ParseTupleFinds(args, &substring, &start, &end))
7811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
Thomas Wouters477c8d52006-05-27 19:21:47 +00007813 result = stringlib_rfind_slice(
7814 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7815 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7816 start, end
7817 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818
7819 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007820
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 if (result < 0) {
7822 PyErr_SetString(PyExc_ValueError, "substring not found");
7823 return NULL;
7824 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007825 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826}
7827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007829"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007831Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007832done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject *
7835unicode_rjust(PyUnicodeObject *self, PyObject *args)
7836{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007837 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007838 Py_UNICODE fillchar = ' ';
7839
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007840 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 return NULL;
7842
Tim Peters7a29bd52001-09-12 03:03:31 +00007843 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 Py_INCREF(self);
7845 return (PyObject*) self;
7846 }
7847
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007848 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849}
7850
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851PyObject *PyUnicode_Split(PyObject *s,
7852 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007853 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854{
7855 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007856
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 s = PyUnicode_FromObject(s);
7858 if (s == NULL)
7859 return NULL;
7860 if (sep != NULL) {
7861 sep = PyUnicode_FromObject(sep);
7862 if (sep == NULL) {
7863 Py_DECREF(s);
7864 return NULL;
7865 }
7866 }
7867
7868 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7869
7870 Py_DECREF(s);
7871 Py_XDECREF(sep);
7872 return result;
7873}
7874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007875PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007876"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877\n\
7878Return a list of the words in S, using sep as the\n\
7879delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007880splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007881whitespace string is a separator and empty strings are\n\
7882removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883
7884static PyObject*
7885unicode_split(PyUnicodeObject *self, PyObject *args)
7886{
7887 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007888 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
Martin v. Löwis18e16552006-02-15 17:27:45 +00007890 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 return NULL;
7892
7893 if (substring == Py_None)
7894 return split(self, NULL, maxcount);
7895 else if (PyUnicode_Check(substring))
7896 return split(self, (PyUnicodeObject *)substring, maxcount);
7897 else
7898 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7899}
7900
Thomas Wouters477c8d52006-05-27 19:21:47 +00007901PyObject *
7902PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7903{
7904 PyObject* str_obj;
7905 PyObject* sep_obj;
7906 PyObject* out;
7907
7908 str_obj = PyUnicode_FromObject(str_in);
7909 if (!str_obj)
7910 return NULL;
7911 sep_obj = PyUnicode_FromObject(sep_in);
7912 if (!sep_obj) {
7913 Py_DECREF(str_obj);
7914 return NULL;
7915 }
7916
7917 out = stringlib_partition(
7918 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7919 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7920 );
7921
7922 Py_DECREF(sep_obj);
7923 Py_DECREF(str_obj);
7924
7925 return out;
7926}
7927
7928
7929PyObject *
7930PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7931{
7932 PyObject* str_obj;
7933 PyObject* sep_obj;
7934 PyObject* out;
7935
7936 str_obj = PyUnicode_FromObject(str_in);
7937 if (!str_obj)
7938 return NULL;
7939 sep_obj = PyUnicode_FromObject(sep_in);
7940 if (!sep_obj) {
7941 Py_DECREF(str_obj);
7942 return NULL;
7943 }
7944
7945 out = stringlib_rpartition(
7946 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7947 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7948 );
7949
7950 Py_DECREF(sep_obj);
7951 Py_DECREF(str_obj);
7952
7953 return out;
7954}
7955
7956PyDoc_STRVAR(partition__doc__,
7957"S.partition(sep) -> (head, sep, tail)\n\
7958\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007959Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007961found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962
7963static PyObject*
7964unicode_partition(PyUnicodeObject *self, PyObject *separator)
7965{
7966 return PyUnicode_Partition((PyObject *)self, separator);
7967}
7968
7969PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007970"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007971\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007972Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007973the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007974separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007975
7976static PyObject*
7977unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7978{
7979 return PyUnicode_RPartition((PyObject *)self, separator);
7980}
7981
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007982PyObject *PyUnicode_RSplit(PyObject *s,
7983 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007984 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007985{
7986 PyObject *result;
7987
7988 s = PyUnicode_FromObject(s);
7989 if (s == NULL)
7990 return NULL;
7991 if (sep != NULL) {
7992 sep = PyUnicode_FromObject(sep);
7993 if (sep == NULL) {
7994 Py_DECREF(s);
7995 return NULL;
7996 }
7997 }
7998
7999 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8000
8001 Py_DECREF(s);
8002 Py_XDECREF(sep);
8003 return result;
8004}
8005
8006PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008007"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008008\n\
8009Return a list of the words in S, using sep as the\n\
8010delimiter string, starting at the end of the string and\n\
8011working to the front. If maxsplit is given, at most maxsplit\n\
8012splits are done. If sep is not specified, any whitespace string\n\
8013is a separator.");
8014
8015static PyObject*
8016unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8017{
8018 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008020
Martin v. Löwis18e16552006-02-15 17:27:45 +00008021 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008022 return NULL;
8023
8024 if (substring == Py_None)
8025 return rsplit(self, NULL, maxcount);
8026 else if (PyUnicode_Check(substring))
8027 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8028 else
8029 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8030}
8031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008032PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00008033"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034\n\
8035Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008036Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
8039static PyObject*
8040unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8041{
Guido van Rossum86662912000-04-11 15:38:46 +00008042 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
Guido van Rossum86662912000-04-11 15:38:46 +00008044 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046
Guido van Rossum86662912000-04-11 15:38:46 +00008047 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048}
8049
8050static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008051PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052{
Walter Dörwald346737f2007-05-31 10:44:43 +00008053 if (PyUnicode_CheckExact(self)) {
8054 Py_INCREF(self);
8055 return self;
8056 } else
8057 /* Subtype -- return genuine unicode string with the same value. */
8058 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8059 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060}
8061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008062PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008063"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064\n\
8065Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008066and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
8068static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008069unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 return fixup(self, fixswapcase);
8072}
8073
Georg Brandlceee0772007-11-27 23:48:05 +00008074PyDoc_STRVAR(maketrans__doc__,
8075"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8076\n\
8077Return a translation table usable for str.translate().\n\
8078If there is only one argument, it must be a dictionary mapping Unicode\n\
8079ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008080Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008081If there are two arguments, they must be strings of equal length, and\n\
8082in the resulting dictionary, each character in x will be mapped to the\n\
8083character at the same position in y. If there is a third argument, it\n\
8084must be a string, whose characters will be mapped to None in the result.");
8085
8086static PyObject*
8087unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8088{
8089 PyObject *x, *y = NULL, *z = NULL;
8090 PyObject *new = NULL, *key, *value;
8091 Py_ssize_t i = 0;
8092 int res;
8093
8094 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8095 return NULL;
8096 new = PyDict_New();
8097 if (!new)
8098 return NULL;
8099 if (y != NULL) {
8100 /* x must be a string too, of equal length */
8101 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8102 if (!PyUnicode_Check(x)) {
8103 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8104 "be a string if there is a second argument");
8105 goto err;
8106 }
8107 if (PyUnicode_GET_SIZE(x) != ylen) {
8108 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8109 "arguments must have equal length");
8110 goto err;
8111 }
8112 /* create entries for translating chars in x to those in y */
8113 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008114 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8115 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008116 if (!key || !value)
8117 goto err;
8118 res = PyDict_SetItem(new, key, value);
8119 Py_DECREF(key);
8120 Py_DECREF(value);
8121 if (res < 0)
8122 goto err;
8123 }
8124 /* create entries for deleting chars in z */
8125 if (z != NULL) {
8126 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008127 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008128 if (!key)
8129 goto err;
8130 res = PyDict_SetItem(new, key, Py_None);
8131 Py_DECREF(key);
8132 if (res < 0)
8133 goto err;
8134 }
8135 }
8136 } else {
8137 /* x must be a dict */
8138 if (!PyDict_Check(x)) {
8139 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8140 "to maketrans it must be a dict");
8141 goto err;
8142 }
8143 /* copy entries into the new dict, converting string keys to int keys */
8144 while (PyDict_Next(x, &i, &key, &value)) {
8145 if (PyUnicode_Check(key)) {
8146 /* convert string keys to integer keys */
8147 PyObject *newkey;
8148 if (PyUnicode_GET_SIZE(key) != 1) {
8149 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8150 "table must be of length 1");
8151 goto err;
8152 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008153 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008154 if (!newkey)
8155 goto err;
8156 res = PyDict_SetItem(new, newkey, value);
8157 Py_DECREF(newkey);
8158 if (res < 0)
8159 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008160 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008161 /* just keep integer keys */
8162 if (PyDict_SetItem(new, key, value) < 0)
8163 goto err;
8164 } else {
8165 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8166 "be strings or integers");
8167 goto err;
8168 }
8169 }
8170 }
8171 return new;
8172 err:
8173 Py_DECREF(new);
8174 return NULL;
8175}
8176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008177PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008178"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179\n\
8180Return a copy of the string S, where all characters have been mapped\n\
8181through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008182Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008183Unmapped characters are left untouched. Characters mapped to None\n\
8184are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
8186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008187unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188{
Georg Brandlceee0772007-11-27 23:48:05 +00008189 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190}
8191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008192PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008193"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008195Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
8197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008198unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 return fixup(self, fixupper);
8201}
8202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008203PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008204"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008206Pad a numeric string S with zeros on the left, to fill a field\n\
8207of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208
8209static PyObject *
8210unicode_zfill(PyUnicodeObject *self, PyObject *args)
8211{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008212 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 PyUnicodeObject *u;
8214
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215 Py_ssize_t width;
8216 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 return NULL;
8218
8219 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008220 if (PyUnicode_CheckExact(self)) {
8221 Py_INCREF(self);
8222 return (PyObject*) self;
8223 }
8224 else
8225 return PyUnicode_FromUnicode(
8226 PyUnicode_AS_UNICODE(self),
8227 PyUnicode_GET_SIZE(self)
8228 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 }
8230
8231 fill = width - self->length;
8232
8233 u = pad(self, fill, 0, '0');
8234
Walter Dörwald068325e2002-04-15 13:36:47 +00008235 if (u == NULL)
8236 return NULL;
8237
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 if (u->str[fill] == '+' || u->str[fill] == '-') {
8239 /* move sign to beginning of string */
8240 u->str[0] = u->str[fill];
8241 u->str[fill] = '0';
8242 }
8243
8244 return (PyObject*) u;
8245}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246
#if 0
/* Compiled out: returns the value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008255PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008256"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008258Return True if S starts with the specified prefix, False otherwise.\n\
8259With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008260With optional end, stop comparing S at that position.\n\
8261prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262
8263static PyObject *
8264unicode_startswith(PyUnicodeObject *self,
8265 PyObject *args)
8266{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008267 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008269 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008270 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008271 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008273 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008274 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008276 if (PyTuple_Check(subobj)) {
8277 Py_ssize_t i;
8278 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8279 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8280 PyTuple_GET_ITEM(subobj, i));
8281 if (substring == NULL)
8282 return NULL;
8283 result = tailmatch(self, substring, start, end, -1);
8284 Py_DECREF(substring);
8285 if (result) {
8286 Py_RETURN_TRUE;
8287 }
8288 }
8289 /* nothing matched */
8290 Py_RETURN_FALSE;
8291 }
8292 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008294 return NULL;
8295 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008297 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298}
8299
8300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008301PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008302"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008304Return True if S ends with the specified suffix, False otherwise.\n\
8305With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008306With optional end, stop comparing S at that position.\n\
8307suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
8309static PyObject *
8310unicode_endswith(PyUnicodeObject *self,
8311 PyObject *args)
8312{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008313 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008315 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008316 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008317 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008319 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8320 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008322 if (PyTuple_Check(subobj)) {
8323 Py_ssize_t i;
8324 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8325 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8326 PyTuple_GET_ITEM(subobj, i));
8327 if (substring == NULL)
8328 return NULL;
8329 result = tailmatch(self, substring, start, end, +1);
8330 Py_DECREF(substring);
8331 if (result) {
8332 Py_RETURN_TRUE;
8333 }
8334 }
8335 Py_RETURN_FALSE;
8336 }
8337 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008341 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008343 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344}
8345
Eric Smith8c663262007-08-25 02:26:07 +00008346#include "stringlib/string_format.h"
8347
8348PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008349"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008350\n\
8351");
8352
Eric Smith4a7d76d2008-05-30 18:10:19 +00008353static PyObject *
8354unicode__format__(PyObject* self, PyObject* args)
8355{
8356 PyObject *format_spec;
8357
8358 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8359 return NULL;
8360
8361 return _PyUnicode_FormatAdvanced(self,
8362 PyUnicode_AS_UNICODE(format_spec),
8363 PyUnicode_GET_SIZE(format_spec));
8364}
8365
Eric Smith8c663262007-08-25 02:26:07 +00008366PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008367"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008368\n\
8369");
8370
8371static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008372unicode__sizeof__(PyUnicodeObject *v)
8373{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008374 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8375 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008376}
8377
8378PyDoc_STRVAR(sizeof__doc__,
8379"S.__sizeof__() -> size of S in memory, in bytes");
8380
8381static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008382unicode_getnewargs(PyUnicodeObject *v)
8383{
8384 return Py_BuildValue("(u#)", v->str, v->length);
8385}
8386
8387
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388static PyMethodDef unicode_methods[] = {
8389
8390 /* Order is according to common usage: often used methods should
8391 appear first, since lookup is done sequentially. */
8392
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008393 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8394 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8395 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008396 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008397 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8398 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8399 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8400 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8401 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8402 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8403 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008404 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008405 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8406 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8407 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008408 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008409 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8410 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8411 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008412 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008413 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008414 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008415 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008416 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8417 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8418 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8419 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8420 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8421 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8422 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8423 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8424 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8425 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8426 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8427 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8428 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8429 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008430 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008431 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008432 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008433 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008434 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008435 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8436 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008437 {"maketrans", (PyCFunction) unicode_maketrans,
8438 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008439 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008440#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008441 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442#endif
8443
8444#if 0
8445 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008446 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447#endif
8448
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008449 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 {NULL, NULL}
8451};
8452
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008453static PyObject *
8454unicode_mod(PyObject *v, PyObject *w)
8455{
8456 if (!PyUnicode_Check(v)) {
8457 Py_INCREF(Py_NotImplemented);
8458 return Py_NotImplemented;
8459 }
8460 return PyUnicode_Format(v, w);
8461}
8462
8463static PyNumberMethods unicode_as_number = {
8464 0, /*nb_add*/
8465 0, /*nb_subtract*/
8466 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008467 unicode_mod, /*nb_remainder*/
8468};
8469
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008471 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008472 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8474 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008475 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 0, /* sq_ass_item */
8477 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008478 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479};
8480
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008481static PyObject*
8482unicode_subscript(PyUnicodeObject* self, PyObject* item)
8483{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008484 if (PyIndex_Check(item)) {
8485 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008486 if (i == -1 && PyErr_Occurred())
8487 return NULL;
8488 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008489 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008490 return unicode_getitem(self, i);
8491 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008493 Py_UNICODE* source_buf;
8494 Py_UNICODE* result_buf;
8495 PyObject* result;
8496
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008497 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008498 &start, &stop, &step, &slicelength) < 0) {
8499 return NULL;
8500 }
8501
8502 if (slicelength <= 0) {
8503 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008504 } else if (start == 0 && step == 1 && slicelength == self->length &&
8505 PyUnicode_CheckExact(self)) {
8506 Py_INCREF(self);
8507 return (PyObject *)self;
8508 } else if (step == 1) {
8509 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008510 } else {
8511 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008512 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8513 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008514
8515 if (result_buf == NULL)
8516 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008517
8518 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8519 result_buf[i] = source_buf[cur];
8520 }
Tim Petersced69f82003-09-16 20:30:58 +00008521
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008522 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008523 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008524 return result;
8525 }
8526 } else {
8527 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8528 return NULL;
8529 }
8530}
8531
8532static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008533 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008534 (binaryfunc)unicode_subscript, /* mp_subscript */
8535 (objobjargproc)0, /* mp_ass_subscript */
8536};
8537
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539/* Helpers for PyUnicode_Format() */
8540
8541static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 if (argidx < arglen) {
8546 (*p_argidx)++;
8547 if (arglen < 0)
8548 return args;
8549 else
8550 return PyTuple_GetItem(args, argidx);
8551 }
8552 PyErr_SetString(PyExc_TypeError,
8553 "not enough arguments for format string");
8554 return NULL;
8555}
8556
Martin v. Löwis18e16552006-02-15 17:27:45 +00008557static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008558strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008560 register Py_ssize_t i;
8561 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 for (i = len - 1; i >= 0; i--)
8563 buffer[i] = (Py_UNICODE) charbuffer[i];
8564
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 return len;
8566}
8567
Neal Norwitzfc76d632006-01-10 06:03:13 +00008568static int
8569doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8570{
Tim Peters15231542006-02-16 01:08:01 +00008571 Py_ssize_t result;
8572
Neal Norwitzfc76d632006-01-10 06:03:13 +00008573 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008574 result = strtounicode(buffer, (char *)buffer);
8575 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008576}
8577
#if 0
/* Disabled.  Counterpart of doubletounicode() for long values: formats x
   with PyOS_snprintf() into the start of buffer and widens the text in
   place.  Returns the number of characters produced. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;      /* same storage, viewed as bytes */
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008589
Guido van Rossum078151d2002-08-11 04:24:12 +00008590/* XXX To save some code duplication, formatfloat/long/int could have been
8591 shared with stringobject.c, converting from 8-bit to Unicode after the
8592 formatting is done. */
8593
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594static int
8595formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008596 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 int flags,
8598 int prec,
8599 int type,
8600 PyObject *v)
8601{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008602 /* fmt = '%#.' + `prec` + `type`
8603 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 char fmt[20];
8605 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008606
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 x = PyFloat_AsDouble(v);
8608 if (x == -1.0 && PyErr_Occurred())
8609 return -1;
8610 if (prec < 0)
8611 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008612 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8613 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008614 /* Worst case length calc to ensure no buffer overrun:
8615
8616 'g' formats:
8617 fmt = %#.<prec>g
8618 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8619 for any double rep.)
8620 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8621
8622 'f' formats:
8623 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8624 len = 1 + 50 + 1 + prec = 52 + prec
8625
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008626 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008627 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008628
8629 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008630 if (((type == 'g' || type == 'G') &&
8631 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008632 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008633 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008634 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008635 return -1;
8636 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008637 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8638 (flags&F_ALT) ? "#" : "",
8639 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008640 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641}
8642
Tim Peters38fd5b62000-09-21 05:43:11 +00008643static PyObject*
8644formatlong(PyObject *val, int flags, int prec, int type)
8645{
8646 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008647 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008648 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008649 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008650
Christian Heimes72b710a2008-05-26 13:28:38 +00008651 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008652 if (!str)
8653 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008654 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008655 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008656 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008657}
8658
#if 0
/* Disabled.  Format the C long value of v into buf for an integer '%'
   conversion.  Returns the number of characters written, or -1 with an
   exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     * + 1 + 1
     * = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign = "";
    int based;          /* true for the 'x', 'X' and 'o' conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    if (x < 0) {
        /* Negative values: '%u' behaves like '%d', and the based
           conversions get an explicit '-' with the magnitude formatted. */
        if (type == 'u')
            type = 'd';
        if (type == 'x' || type == 'X' || type == 'o')
            sign = "-";
    }
    based = (type == 'x' || type == 'X' || type == 'o');

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if (!((flags & F_ALT) && based)) {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    else {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736
8737static int
8738formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008739 size_t buflen,
8740 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008742 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008743 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008744 if (PyUnicode_GET_SIZE(v) == 1) {
8745 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8746 buf[1] = '\0';
8747 return 1;
8748 }
8749#ifndef Py_UNICODE_WIDE
8750 if (PyUnicode_GET_SIZE(v) == 2) {
8751 /* Decode a valid surrogate pair */
8752 int c0 = PyUnicode_AS_UNICODE(v)[0];
8753 int c1 = PyUnicode_AS_UNICODE(v)[1];
8754 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8755 0xDC00 <= c1 && c1 <= 0xDFFF) {
8756 buf[0] = c0;
8757 buf[1] = c1;
8758 buf[2] = '\0';
8759 return 2;
8760 }
8761 }
8762#endif
8763 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 else {
8766 /* Integer input truncated to a character */
8767 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008768 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008770 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008771
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008772 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008773 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008774 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008775 return -1;
8776 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008777
8778#ifndef Py_UNICODE_WIDE
8779 if (x > 0xffff) {
8780 x -= 0x10000;
8781 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8782 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8783 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008784 }
8785#endif
8786 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008787 buf[1] = '\0';
8788 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008790
8791 onError:
8792 PyErr_SetString(PyExc_TypeError,
8793 "%c requires int or char");
8794 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795}
8796
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008797/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8798
8799 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8800 chars are formatted. XXX This is a magic number. Each formatting
8801 routine does bounds checking to ensure no overflow, but a better
8802 solution may be to malloc a buffer of appropriate size for each
8803 format. For now, the current solution is sufficient.
8804*/
8805#define FORMATBUFLEN (size_t)120
8806
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807PyObject *PyUnicode_Format(PyObject *format,
8808 PyObject *args)
8809{
8810 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008811 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 int args_owned = 0;
8813 PyUnicodeObject *result = NULL;
8814 PyObject *dict = NULL;
8815 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008816
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 if (format == NULL || args == NULL) {
8818 PyErr_BadInternalCall();
8819 return NULL;
8820 }
8821 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008822 if (uformat == NULL)
8823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 fmt = PyUnicode_AS_UNICODE(uformat);
8825 fmtcnt = PyUnicode_GET_SIZE(uformat);
8826
8827 reslen = rescnt = fmtcnt + 100;
8828 result = _PyUnicode_New(reslen);
8829 if (result == NULL)
8830 goto onError;
8831 res = PyUnicode_AS_UNICODE(result);
8832
8833 if (PyTuple_Check(args)) {
8834 arglen = PyTuple_Size(args);
8835 argidx = 0;
8836 }
8837 else {
8838 arglen = -1;
8839 argidx = -2;
8840 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008841 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008842 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 dict = args;
8844
8845 while (--fmtcnt >= 0) {
8846 if (*fmt != '%') {
8847 if (--rescnt < 0) {
8848 rescnt = fmtcnt + 100;
8849 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008850 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008851 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8853 --rescnt;
8854 }
8855 *res++ = *fmt++;
8856 }
8857 else {
8858 /* Got a format specifier */
8859 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008860 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 Py_UNICODE c = '\0';
8863 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008864 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 PyObject *v = NULL;
8866 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008867 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008869 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008870 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
8872 fmt++;
8873 if (*fmt == '(') {
8874 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008875 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 PyObject *key;
8877 int pcount = 1;
8878
8879 if (dict == NULL) {
8880 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008881 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 goto onError;
8883 }
8884 ++fmt;
8885 --fmtcnt;
8886 keystart = fmt;
8887 /* Skip over balanced parentheses */
8888 while (pcount > 0 && --fmtcnt >= 0) {
8889 if (*fmt == ')')
8890 --pcount;
8891 else if (*fmt == '(')
8892 ++pcount;
8893 fmt++;
8894 }
8895 keylen = fmt - keystart - 1;
8896 if (fmtcnt < 0 || pcount > 0) {
8897 PyErr_SetString(PyExc_ValueError,
8898 "incomplete format key");
8899 goto onError;
8900 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008901#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008902 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 then looked up since Python uses strings to hold
8904 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008905 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906 key = PyUnicode_EncodeUTF8(keystart,
8907 keylen,
8908 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008909#else
8910 key = PyUnicode_FromUnicode(keystart, keylen);
8911#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 if (key == NULL)
8913 goto onError;
8914 if (args_owned) {
8915 Py_DECREF(args);
8916 args_owned = 0;
8917 }
8918 args = PyObject_GetItem(dict, key);
8919 Py_DECREF(key);
8920 if (args == NULL) {
8921 goto onError;
8922 }
8923 args_owned = 1;
8924 arglen = -1;
8925 argidx = -2;
8926 }
8927 while (--fmtcnt >= 0) {
8928 switch (c = *fmt++) {
8929 case '-': flags |= F_LJUST; continue;
8930 case '+': flags |= F_SIGN; continue;
8931 case ' ': flags |= F_BLANK; continue;
8932 case '#': flags |= F_ALT; continue;
8933 case '0': flags |= F_ZERO; continue;
8934 }
8935 break;
8936 }
8937 if (c == '*') {
8938 v = getnextarg(args, arglen, &argidx);
8939 if (v == NULL)
8940 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008941 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 PyErr_SetString(PyExc_TypeError,
8943 "* wants int");
8944 goto onError;
8945 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008946 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008947 if (width == -1 && PyErr_Occurred())
8948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 if (width < 0) {
8950 flags |= F_LJUST;
8951 width = -width;
8952 }
8953 if (--fmtcnt >= 0)
8954 c = *fmt++;
8955 }
8956 else if (c >= '0' && c <= '9') {
8957 width = c - '0';
8958 while (--fmtcnt >= 0) {
8959 c = *fmt++;
8960 if (c < '0' || c > '9')
8961 break;
8962 if ((width*10) / 10 != width) {
8963 PyErr_SetString(PyExc_ValueError,
8964 "width too big");
8965 goto onError;
8966 }
8967 width = width*10 + (c - '0');
8968 }
8969 }
8970 if (c == '.') {
8971 prec = 0;
8972 if (--fmtcnt >= 0)
8973 c = *fmt++;
8974 if (c == '*') {
8975 v = getnextarg(args, arglen, &argidx);
8976 if (v == NULL)
8977 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008978 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 PyErr_SetString(PyExc_TypeError,
8980 "* wants int");
8981 goto onError;
8982 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008983 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008984 if (prec == -1 && PyErr_Occurred())
8985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 if (prec < 0)
8987 prec = 0;
8988 if (--fmtcnt >= 0)
8989 c = *fmt++;
8990 }
8991 else if (c >= '0' && c <= '9') {
8992 prec = c - '0';
8993 while (--fmtcnt >= 0) {
8994 c = Py_CHARMASK(*fmt++);
8995 if (c < '0' || c > '9')
8996 break;
8997 if ((prec*10) / 10 != prec) {
8998 PyErr_SetString(PyExc_ValueError,
8999 "prec too big");
9000 goto onError;
9001 }
9002 prec = prec*10 + (c - '0');
9003 }
9004 }
9005 } /* prec */
9006 if (fmtcnt >= 0) {
9007 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 if (--fmtcnt >= 0)
9009 c = *fmt++;
9010 }
9011 }
9012 if (fmtcnt < 0) {
9013 PyErr_SetString(PyExc_ValueError,
9014 "incomplete format");
9015 goto onError;
9016 }
9017 if (c != '%') {
9018 v = getnextarg(args, arglen, &argidx);
9019 if (v == NULL)
9020 goto onError;
9021 }
9022 sign = 0;
9023 fill = ' ';
9024 switch (c) {
9025
9026 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009027 pbuf = formatbuf;
9028 /* presume that buffer length is at least 1 */
9029 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 len = 1;
9031 break;
9032
9033 case 's':
9034 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009035 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 if (PyUnicode_Check(v) && c == 's') {
9037 temp = v;
9038 Py_INCREF(temp);
9039 }
9040 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009042 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009043 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009045 else
9046 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 if (temp == NULL)
9048 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009049 if (PyUnicode_Check(temp))
9050 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009051 else {
9052 Py_DECREF(temp);
9053 PyErr_SetString(PyExc_TypeError,
9054 "%s argument has non-string str()");
9055 goto onError;
9056 }
9057 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009058 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 len = PyUnicode_GET_SIZE(temp);
9060 if (prec >= 0 && len > prec)
9061 len = prec;
9062 break;
9063
9064 case 'i':
9065 case 'd':
9066 case 'u':
9067 case 'o':
9068 case 'x':
9069 case 'X':
9070 if (c == 'i')
9071 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009072 isnumok = 0;
9073 if (PyNumber_Check(v)) {
9074 PyObject *iobj=NULL;
9075
9076 if (PyLong_Check(v)) {
9077 iobj = v;
9078 Py_INCREF(iobj);
9079 }
9080 else {
9081 iobj = PyNumber_Long(v);
9082 }
9083 if (iobj!=NULL) {
9084 if (PyLong_Check(iobj)) {
9085 isnumok = 1;
9086 temp = formatlong(iobj, flags, prec, c);
9087 Py_DECREF(iobj);
9088 if (!temp)
9089 goto onError;
9090 pbuf = PyUnicode_AS_UNICODE(temp);
9091 len = PyUnicode_GET_SIZE(temp);
9092 sign = 1;
9093 }
9094 else {
9095 Py_DECREF(iobj);
9096 }
9097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009099 if (!isnumok) {
9100 PyErr_Format(PyExc_TypeError,
9101 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009102 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009103 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009104 }
9105 if (flags & F_ZERO)
9106 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 break;
9108
9109 case 'e':
9110 case 'E':
9111 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009112 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 case 'g':
9114 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009115 if (c == 'F')
9116 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009117 pbuf = formatbuf;
9118 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9119 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120 if (len < 0)
9121 goto onError;
9122 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009123 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 fill = '0';
9125 break;
9126
9127 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009128 pbuf = formatbuf;
9129 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 if (len < 0)
9131 goto onError;
9132 break;
9133
9134 default:
9135 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009136 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009137 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009138 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009139 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009140 (Py_ssize_t)(fmt - 1 -
9141 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 goto onError;
9143 }
9144 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009145 if (*pbuf == '-' || *pbuf == '+') {
9146 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 len--;
9148 }
9149 else if (flags & F_SIGN)
9150 sign = '+';
9151 else if (flags & F_BLANK)
9152 sign = ' ';
9153 else
9154 sign = 0;
9155 }
9156 if (width < len)
9157 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009158 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159 reslen -= rescnt;
9160 rescnt = width + fmtcnt + 100;
9161 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009162 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009163 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009164 PyErr_NoMemory();
9165 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009166 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009167 if (_PyUnicode_Resize(&result, reslen) < 0) {
9168 Py_XDECREF(temp);
9169 goto onError;
9170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 res = PyUnicode_AS_UNICODE(result)
9172 + reslen - rescnt;
9173 }
9174 if (sign) {
9175 if (fill != ' ')
9176 *res++ = sign;
9177 rescnt--;
9178 if (width > len)
9179 width--;
9180 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009181 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009182 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009183 assert(pbuf[1] == c);
9184 if (fill != ' ') {
9185 *res++ = *pbuf++;
9186 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009187 }
Tim Petersfff53252001-04-12 18:38:48 +00009188 rescnt -= 2;
9189 width -= 2;
9190 if (width < 0)
9191 width = 0;
9192 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 if (width > len && !(flags & F_LJUST)) {
9195 do {
9196 --rescnt;
9197 *res++ = fill;
9198 } while (--width > len);
9199 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009200 if (fill == ' ') {
9201 if (sign)
9202 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009203 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009204 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009205 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009206 *res++ = *pbuf++;
9207 *res++ = *pbuf++;
9208 }
9209 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009210 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 res += len;
9212 rescnt -= len;
9213 while (--width >= len) {
9214 --rescnt;
9215 *res++ = ' ';
9216 }
9217 if (dict && (argidx < arglen) && c != '%') {
9218 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009219 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009220 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 goto onError;
9222 }
9223 Py_XDECREF(temp);
9224 } /* '%' */
9225 } /* until end */
9226 if (argidx < arglen && !dict) {
9227 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009228 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 goto onError;
9230 }
9231
Thomas Woutersa96affe2006-03-12 00:29:36 +00009232 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 if (args_owned) {
9235 Py_DECREF(args);
9236 }
9237 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 return (PyObject *)result;
9239
9240 onError:
9241 Py_XDECREF(result);
9242 Py_DECREF(uformat);
9243 if (args_owned) {
9244 Py_DECREF(args);
9245 }
9246 return NULL;
9247}
9248
Jeremy Hylton938ace62002-07-17 16:30:39 +00009249static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009250unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9251
Tim Peters6d6c1a32001-08-02 04:15:00 +00009252static PyObject *
9253unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9254{
9255 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009256 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009257 char *encoding = NULL;
9258 char *errors = NULL;
9259
Guido van Rossume023fe02001-08-30 03:12:59 +00009260 if (type != &PyUnicode_Type)
9261 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009262 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009263 kwlist, &x, &encoding, &errors))
9264 return NULL;
9265 if (x == NULL)
9266 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009267 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009268 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009269 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009270 return PyUnicode_FromEncodedObject(x, encoding, errors);
9271}
9272
Guido van Rossume023fe02001-08-30 03:12:59 +00009273static PyObject *
9274unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9275{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009276 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009277 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009278
9279 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9280 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9281 if (tmp == NULL)
9282 return NULL;
9283 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009284 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009285 if (pnew == NULL) {
9286 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009287 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009288 }
Christian Heimesb186d002008-03-18 15:15:01 +00009289 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009290 if (pnew->str == NULL) {
9291 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009292 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009293 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009294 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009295 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009296 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9297 pnew->length = n;
9298 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009299 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009300 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009301}
9302
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009309
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009310static PyObject *unicode_iter(PyObject *seq);
9311
/* Type object for str.  The type is subclassable (Py_TPFLAGS_BASETYPE),
   derives from PyBaseObject_Type, and leaves tp_alloc and tp_init unset
   here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_compare */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9355
9356/* Initialize the Unicode implementation */
9357
Thomas Wouters78890102000-07-22 19:25:51 +00009358void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009360 int i;
9361
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 /* XXX - move this array to unicodectype.c ? */
9363 Py_UNICODE linebreak[] = {
9364 0x000A, /* LINE FEED */
9365 0x000D, /* CARRIAGE RETURN */
9366 0x001C, /* FILE SEPARATOR */
9367 0x001D, /* GROUP SEPARATOR */
9368 0x001E, /* RECORD SEPARATOR */
9369 0x0085, /* NEXT LINE */
9370 0x2028, /* LINE SEPARATOR */
9371 0x2029, /* PARAGRAPH SEPARATOR */
9372 };
9373
Fred Drakee4315f52000-05-09 19:53:39 +00009374 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009375 free_list = NULL;
9376 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009378 if (!unicode_empty)
9379 return;
9380
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009381 for (i = 0; i < 256; i++)
9382 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009383 if (PyType_Ready(&PyUnicode_Type) < 0)
9384 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009385
9386 /* initialize the linebreak bloom filter */
9387 bloom_linebreak = make_bloom_mask(
9388 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9389 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009390
9391 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392}
9393
9394/* Finalize the Unicode implementation */
9395
Christian Heimesa156e092008-02-16 07:38:31 +00009396int
9397PyUnicode_ClearFreeList(void)
9398{
9399 int freelist_size = numfree;
9400 PyUnicodeObject *u;
9401
9402 for (u = free_list; u != NULL;) {
9403 PyUnicodeObject *v = u;
9404 u = *(PyUnicodeObject **)u;
9405 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009406 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009407 Py_XDECREF(v->defenc);
9408 PyObject_Del(v);
9409 numfree--;
9410 }
9411 free_list = NULL;
9412 assert(numfree == 0);
9413 return freelist_size;
9414}
9415
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416void
Thomas Wouters78890102000-07-22 19:25:51 +00009417_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009419 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009421 Py_XDECREF(unicode_empty);
9422 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009423
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009424 for (i = 0; i < 256; i++) {
9425 if (unicode_latin1[i]) {
9426 Py_DECREF(unicode_latin1[i]);
9427 unicode_latin1[i] = NULL;
9428 }
9429 }
Christian Heimesa156e092008-02-16 07:38:31 +00009430 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009432
/* Intern the string *p in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that object (the found object is INCREF'ed, the old *p is
   DECREF'ed).  Otherwise the string is inserted as both key and value, its
   reference count is lowered by two so the dict's references are not
   counted, and it is marked SSTATE_INTERNED_MORTAL.

   *p is left untouched when it is an instance of a str subclass, when it is
   already interned, or when the dict cannot be created or updated (the
   exception is cleared in those cases).  Passing NULL or a non-unicode
   object is a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* Create the interned dict lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    /* An equal string is already interned: hand that one back instead. */
    if (t) {
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* NOTE(review): recursion_critical is raised around the insert,
       presumably for the same stack-overflow reason as above -- confirm.
       It is reset on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9480
9481void
9482PyUnicode_InternImmortal(PyObject **p)
9483{
9484 PyUnicode_InternInPlace(p);
9485 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9486 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9487 Py_INCREF(*p);
9488 }
9489}
9490
9491PyObject *
9492PyUnicode_InternFromString(const char *cp)
9493{
9494 PyObject *s = PyUnicode_FromString(cp);
9495 if (s == NULL)
9496 return NULL;
9497 PyUnicode_InternInPlace(&s);
9498 return s;
9499}
9500
9501void _Py_ReleaseInternedUnicodeStrings(void)
9502{
9503 PyObject *keys;
9504 PyUnicodeObject *s;
9505 Py_ssize_t i, n;
9506 Py_ssize_t immortal_size = 0, mortal_size = 0;
9507
9508 if (interned == NULL || !PyDict_Check(interned))
9509 return;
9510 keys = PyDict_Keys(interned);
9511 if (keys == NULL || !PyList_Check(keys)) {
9512 PyErr_Clear();
9513 return;
9514 }
9515
9516 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9517 detector, interned unicode strings are not forcibly deallocated;
9518 rather, we give them their stolen references back, and then clear
9519 and DECREF the interned dict. */
9520
9521 n = PyList_GET_SIZE(keys);
9522 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9523 n);
9524 for (i = 0; i < n; i++) {
9525 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9526 switch (s->state) {
9527 case SSTATE_NOT_INTERNED:
9528 /* XXX Shouldn't happen */
9529 break;
9530 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009531 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009532 immortal_size += s->length;
9533 break;
9534 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009535 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009536 mortal_size += s->length;
9537 break;
9538 default:
9539 Py_FatalError("Inconsistent interned string state.");
9540 }
9541 s->state = SSTATE_NOT_INTERNED;
9542 }
9543 fprintf(stderr, "total size of all interned strings: "
9544 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9545 "mortal/immortal\n", mortal_size, immortal_size);
9546 Py_DECREF(keys);
9547 PyDict_Clear(interned);
9548 Py_DECREF(interned);
9549 interned = NULL;
9550}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009551
9552
9553/********************* Unicode Iterator **************************/
9554
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9560
/* Destructor for str iterators: stop GC tracking, drop the reference to the
   underlying string (it_seq may already be NULL) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9568
/* GC traversal for str iterators: the only object reference held is
   it_seq.  Returns 0 unless the visit callback reports otherwise. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9575
9576static PyObject *
9577unicodeiter_next(unicodeiterobject *it)
9578{
9579 PyUnicodeObject *seq;
9580 PyObject *item;
9581
9582 assert(it != NULL);
9583 seq = it->it_seq;
9584 if (seq == NULL)
9585 return NULL;
9586 assert(PyUnicode_Check(seq));
9587
9588 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009589 item = PyUnicode_FromUnicode(
9590 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009591 if (item != NULL)
9592 ++it->it_index;
9593 return item;
9594 }
9595
9596 Py_DECREF(seq);
9597 it->it_seq = NULL;
9598 return NULL;
9599}
9600
9601static PyObject *
9602unicodeiter_len(unicodeiterobject *it)
9603{
9604 Py_ssize_t len = 0;
9605 if (it->it_seq)
9606 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009607 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009608}
9609
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str iterators; __length_hint__ is the only entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9617
/* Type object for str iterators.  Instances take part in cyclic GC
   (Py_TPFLAGS_HAVE_GC) and are their own iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9650
9651static PyObject *
9652unicode_iter(PyObject *seq)
9653{
9654 unicodeiterobject *it;
9655
9656 if (!PyUnicode_Check(seq)) {
9657 PyErr_BadInternalCall();
9658 return NULL;
9659 }
9660 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9661 if (it == NULL)
9662 return NULL;
9663 it->it_index = 0;
9664 Py_INCREF(seq);
9665 it->it_seq = (PyUnicodeObject *)seq;
9666 _PyObject_GC_TRACK(it);
9667 return (PyObject *)it;
9668}
9669
Martin v. Löwis5b222132007-06-10 09:51:05 +00009670size_t
9671Py_UNICODE_strlen(const Py_UNICODE *u)
9672{
9673 int res = 0;
9674 while(*u++)
9675 res++;
9676 return res;
9677}
9678
9679Py_UNICODE*
9680Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9681{
9682 Py_UNICODE *u = s1;
9683 while ((*u++ = *s2++));
9684 return s1;
9685}
9686
9687Py_UNICODE*
9688Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9689{
9690 Py_UNICODE *u = s1;
9691 while ((*u++ = *s2++))
9692 if (n-- == 0)
9693 break;
9694 return s1;
9695}
9696
9697int
9698Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9699{
9700 while (*s1 && *s2 && *s1 == *s2)
9701 s1++, s2++;
9702 if (*s1 && *s2)
9703 return (*s1 < *s2) ? -1 : +1;
9704 if (*s1)
9705 return 1;
9706 if (*s2)
9707 return -1;
9708 return 0;
9709}
9710
9711Py_UNICODE*
9712Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9713{
9714 const Py_UNICODE *p;
9715 for (p = s; *p; p++)
9716 if (*p == c)
9717 return (Py_UNICODE*)p;
9718 return NULL;
9719}
9720
9721
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009722#ifdef __cplusplus
9723}
9724#endif
9725
9726
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009727/*
9728Local variables:
9729c-basic-offset: 4
9730indent-tabs-mode: nil
9731End:
9732*/