blob: 4a5aec7de88a6bd5a5e3463fb5a6d5666d6cb8b7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Limit for the Unicode object free list: unicode_dealloc() stops
   adding objects to the list once it holds this many entries. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   The limit is compared against the string length, i.e. it is
   expressed in Py_UNICODE characters, not in bytes.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* Free list for Unicode objects.  The list is singly linked: the link to
   the next free object is stored in the first pointer-sized word of each
   freed object.  numfree counts the objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are created lazily by the constructors and
   indexed by code point. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is fixed to "utf-8".  Always use the
   PyUnicode_GetDefaultEncoding() API to access this global.

   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
   hard coded default!
*/
static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   One entry per code point 0x00..0x7F; a non-zero entry marks the
   code point as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C),
       CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks: one entry per code point 0x00..0x7F, non-zero
   for the code points treated as line breaks. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.

   The shift is performed on an unsigned long: the bit index can be 31,
   and shifting a plain int 1 left by 31 overflows a 32-bit signed int,
   which is undefined behaviour.  mask is parenthesized so that an
   expression argument (e.g. a | b) is combined as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break.  Code points below 128 are looked up
   directly in ascii_linebreak; for the others the bloom mask is checked
   first and Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit
   for ch is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True if chr is a member of set (setlen elements).  The bloom mask is
   tested first, so unicode_member() is only called when the mask bit for
   chr is set.  The expansion is fully parenthesized so the macro behaves
   as a single operand, e.g. under ! or next to ||. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
428 PyErr_BadInternalCall();
429 return -1;
430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 PyErr_BadInternalCall();
434 return -1;
435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000447 Py_DECREF(*unicode);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000448 *unicode = w;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 return 0;
450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
475 }
476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 if (!unicode)
484 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000485 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
510 "Negative size passed to PyUnicode_FromStringAndSize");
511 return NULL;
512 }
513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
524 }
525
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000529 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000534 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566{
567 PyUnicodeObject *unicode;
568
569 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000570 if (size == 0)
571 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572 PyErr_BadInternalCall();
573 return NULL;
574 }
575
Martin v. Löwis790465f2008-04-05 20:41:37 +0000576 if (size == -1) {
577 size = wcslen(w);
578 }
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 unicode = _PyUnicode_New(size);
581 if (!unicode)
582 return NULL;
583
584 /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000587#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 {
589 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000590 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000592 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 *u++ = *w++;
594 }
595#endif
596
597 return (PyObject *)unicode;
598}
599
Walter Dörwald346737f2007-05-31 10:44:43 +0000600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603 *fmt++ = '%';
604 if (width) {
605 if (zeropad)
606 *fmt++ = '0';
607 fmt += sprintf(fmt, "%d", width);
608 }
609 if (precision)
610 fmt += sprintf(fmt, ".%d", precision);
611 if (longflag)
612 *fmt++ = 'l';
613 else if (size_tflag) {
614 char *f = PY_FORMAT_SIZE_T;
615 while (*f)
616 *fmt++ = *f++;
617 }
618 *fmt++ = c;
619 *fmt = '\0';
620}
621
/* Append the NUL-terminated string `string` to the output, one element
   at a time.  Relies on two variables of the calling scope: `copy`
   (scratch read pointer) and `s` (output write pointer, advanced past
   the copied data).  No terminator is written. */
#define appendstring(string) \
    {copy = string; while (*copy) *s++ = *copy++;}
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000628 Py_ssize_t callcount = 0;
629 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000630 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000631 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000632 int width = 0;
633 int precision = 0;
634 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000635 const char* f;
636 Py_UNICODE *s;
637 PyObject *string;
638 /* used by sprintf */
639 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000640 /* use abuffer instead of buffer, if we need more space
641 * (which can happen if there's a format specifier with width). */
642 char *abuffer = NULL;
643 char *realbuffer;
644 Py_ssize_t abuffersize = 0;
645 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646 const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649 Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef __va_copy
652 __va_copy(count, vargs);
653#else
654 count = vargs;
655#endif
656#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000657 /* step 1: count the number of %S/%R/%A format specifications
658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659 * these objects once during step 3 and put the result in
660 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000663 ++callcount;
664 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000665 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000666 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000669 if (!callresults) {
670 PyErr_NoMemory();
671 return NULL;
672 }
673 callresult = callresults;
674 }
675 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 for (f = format; *f; f++) {
677 if (*f == '%') {
678 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000681 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683 ;
684
685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686 * they don't affect the amount of space we reserve.
687 */
688 if ((*f == 'l' || *f == 'z') &&
689 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000690 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691
692 switch (*f) {
693 case 'c':
694 (void)va_arg(count, int);
695 /* fall through... */
696 case '%':
697 n++;
698 break;
699 case 'd': case 'u': case 'i': case 'x':
700 (void) va_arg(count, int);
701 /* 20 bytes is enough to hold a 64-bit
702 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000703 This isn't enough for octal.
704 If a width is specified we need more
705 (which we allocate later). */
706 if (width < 20)
707 width = 20;
708 n += width;
709 if (abuffersize < width)
710 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711 break;
712 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000713 {
714 /* UTF-8 */
715 unsigned char*s;
716 s = va_arg(count, unsigned char*);
717 while (*s) {
718 if (*s < 128) {
719 n++; s++;
720 } else if (*s < 0xc0) {
721 /* invalid UTF-8 */
722 n++; s++;
723 } else if (*s < 0xc0) {
724 n++;
725 s++; if(!*s)break;
726 s++;
727 } else if (*s < 0xe0) {
728 n++;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 } else {
733 #ifdef Py_UNICODE_WIDE
734 n++;
735 #else
736 n+=2;
737 #endif
738 s++; if(!*s)break;
739 s++; if(!*s)break;
740 s++; if(!*s)break;
741 s++;
742 }
743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000745 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 case 'U':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 assert(obj && PyUnicode_Check(obj));
750 n += PyUnicode_GET_SIZE(obj);
751 break;
752 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000753 case 'V':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 const char *str = va_arg(count, const char *);
757 assert(obj || str);
758 assert(!obj || PyUnicode_Check(obj));
759 if (obj)
760 n += PyUnicode_GET_SIZE(obj);
761 else
762 n += strlen(str);
763 break;
764 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000765 case 'S':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 PyObject *str;
769 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000770 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000771 if (!str)
772 goto fail;
773 n += PyUnicode_GET_SIZE(str);
774 /* Remember the str and switch to the next slot */
775 *callresult++ = str;
776 break;
777 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000778 case 'R':
779 {
780 PyObject *obj = va_arg(count, PyObject *);
781 PyObject *repr;
782 assert(obj);
783 repr = PyObject_Repr(obj);
784 if (!repr)
785 goto fail;
786 n += PyUnicode_GET_SIZE(repr);
787 /* Remember the repr and switch to the next slot */
788 *callresult++ = repr;
789 break;
790 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000791 case 'A':
792 {
793 PyObject *obj = va_arg(count, PyObject *);
794 PyObject *ascii;
795 assert(obj);
796 ascii = PyObject_ASCII(obj);
797 if (!ascii)
798 goto fail;
799 n += PyUnicode_GET_SIZE(ascii);
800 /* Remember the repr and switch to the next slot */
801 *callresult++ = ascii;
802 break;
803 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 case 'p':
805 (void) va_arg(count, int);
806 /* maximum 64-bit pointer representation:
807 * 0xffffffffffffffff
808 * so 19 characters is enough.
809 * XXX I count 18 -- what's the extra for?
810 */
811 n += 19;
812 break;
813 default:
814 /* if we stumble upon an unknown
815 formatting code, copy the rest of
816 the format string to the output
817 string. (we cannot just skip the
818 code, since there's no way to know
819 what's in the argument list) */
820 n += strlen(p);
821 goto expand;
822 }
823 } else
824 n++;
825 }
826 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000828 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000829 if (!abuffer) {
830 PyErr_NoMemory();
831 goto fail;
832 }
833 realbuffer = abuffer;
834 }
835 else
836 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 we don't have to resize the string.
840 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 string = PyUnicode_FromUnicode(NULL, n);
842 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844
845 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000846 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847
848 for (f = format; *f; f++) {
849 if (*f == '%') {
850 const char* p = f++;
851 int longflag = 0;
852 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 zeropad = (*f == '0');
854 /* parse the width.precision part */
855 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000856 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 width = (width*10) + *f++ - '0';
858 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (*f == '.') {
860 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000861 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 /* handle the long flag, but only for %ld and %lu.
865 others can be added when necessary. */
866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867 longflag = 1;
868 ++f;
869 }
870 /* handle the size_t flag. */
871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872 size_tflag = 1;
873 ++f;
874 }
875
876 switch (*f) {
877 case 'c':
878 *s++ = va_arg(vargs, int);
879 break;
880 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000887 sprintf(realbuffer, fmt, va_arg(vargs, int));
888 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 break;
890 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
905 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 break;
910 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000911 {
912 /* Parameter must be UTF-8 encoded.
913 In case of encoding errors, use
914 the replacement character. */
915 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000917 u = PyUnicode_DecodeUTF8(p, strlen(p),
918 "replace");
919 if (!u)
920 goto fail;
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922 PyUnicode_GET_SIZE(u));
923 s += PyUnicode_GET_SIZE(u);
924 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000926 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000927 case 'U':
928 {
929 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000933 break;
934 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000935 case 'V':
936 {
937 PyObject *obj = va_arg(vargs, PyObject *);
938 const char *str = va_arg(vargs, const char *);
939 if (obj) {
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 } else {
944 appendstring(str);
945 }
946 break;
947 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000948 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000949 case 'R':
950 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000951 Py_UNICODE *ucopy;
952 Py_ssize_t usize;
953 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 /* unused, since we already have the result */
955 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000956 ucopy = PyUnicode_AS_UNICODE(*callresult);
957 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 for (upos = 0; upos<usize;)
959 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000962 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000963 ++callresult;
964 break;
965 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966 case 'p':
967 sprintf(buffer, "%p", va_arg(vargs, void*));
968 /* %p is ill-defined: ensure leading 0x. */
969 if (buffer[1] == 'X')
970 buffer[1] = 'x';
971 else if (buffer[1] != 'x') {
972 memmove(buffer+2, buffer, strlen(buffer)+1);
973 buffer[0] = '0';
974 buffer[1] = 'x';
975 }
976 appendstring(buffer);
977 break;
978 case '%':
979 *s++ = '%';
980 break;
981 default:
982 appendstring(p);
983 goto end;
984 }
985 } else
986 *s++ = *f;
987 }
988
989 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000990 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000992 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000993 PyObject_Free(abuffer);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000996 fail:
997 if (callresults) {
998 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000999 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 Py_DECREF(*callresult2);
1001 ++callresult2;
1002 }
Christian Heimesb186d002008-03-18 15:15:01 +00001003 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001004 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001005 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001006 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001007 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015 PyObject* ret;
1016 va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019 va_start(vargs, format);
1020#else
1021 va_start(vargs);
1022#endif
1023 ret = PyUnicode_FromFormatV(format, vargs);
1024 va_end(vargs);
1025 return ret;
1026}
1027
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029 wchar_t *w,
1030 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031{
1032 if (unicode == NULL) {
1033 PyErr_BadInternalCall();
1034 return -1;
1035 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001036
1037 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039 size = PyUnicode_GET_SIZE(unicode) + 1;
1040
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041#ifdef HAVE_USABLE_WCHAR_T
1042 memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044 {
1045 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001046 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001048 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 *w++ = *u++;
1050 }
1051#endif
1052
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001053 if (size > PyUnicode_GET_SIZE(unicode))
1054 return PyUnicode_GET_SIZE(unicode);
1055 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 return size;
1057}
1058
1059#endif
1060
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065 if (ordinal < 0 || ordinal > 0x10ffff) {
1066 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001067 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 return NULL;
1069 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001070
1071#ifndef Py_UNICODE_WIDE
1072 if (ordinal > 0xffff) {
1073 ordinal -= 0x10000;
1074 s[0] = 0xD800 | (ordinal >> 10);
1075 s[1] = 0xDC00 | (ordinal & 0x3FF);
1076 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#endif
1079
Hye-Shik Chang40574832004-04-06 07:24:51 +00001080 s[0] = (Py_UNICODE)ordinal;
1081 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001082}
1083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 if (PyUnicode_CheckExact(obj)) {
1089 Py_INCREF(obj);
1090 return obj;
1091 }
1092 if (PyUnicode_Check(obj)) {
1093 /* For a Unicode subtype that's not a Unicode object,
1094 return a true Unicode object with the same data. */
1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096 PyUnicode_GET_SIZE(obj));
1097 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001098 PyErr_Format(PyExc_TypeError,
1099 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001100 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001101 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105 const char *encoding,
1106 const char *errors)
1107{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (obj == NULL) {
1113 PyErr_BadInternalCall();
1114 return NULL;
1115 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001117 if (PyUnicode_Check(obj)) {
1118 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001119 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122
1123 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001124 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001125 s = PyBytes_AS_STRING(obj);
1126 len = PyBytes_GET_SIZE(obj);
1127 }
1128 else if (PyByteArray_Check(obj)) {
1129 s = PyByteArray_AS_STRING(obj);
1130 len = PyByteArray_GET_SIZE(obj);
1131 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001132 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133 /* Overwrite the error message with something more useful in
1134 case of a TypeError. */
1135 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001137 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001138 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001139 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001140 goto onError;
1141 }
Tim Petersced69f82003-09-16 20:30:58 +00001142
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001143 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 if (len == 0) {
1145 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001146 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 }
Tim Petersced69f82003-09-16 20:30:58 +00001148 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001150
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001151 return v;
1152
1153 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 const char *encoding,
1160 const char *errors)
1161{
1162 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001163 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 char lower[20]; /* Enough for any encoding name we recognize */
1165 char *l;
1166 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
1168 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 encoding = PyUnicode_GetDefaultEncoding();
1170
1171 /* Convert encoding to lower case and replace '_' with '-' in order to
1172 catch e.g. UTF_8 */
1173 e = encoding;
1174 l = lower;
1175 while (*e && l < &lower[(sizeof lower) - 2]) {
1176 if (ISUPPER(*e)) {
1177 *l++ = TOLOWER(*e++);
1178 }
1179 else if (*e == '_') {
1180 *l++ = '-';
1181 e++;
1182 }
1183 else {
1184 *l++ = *e++;
1185 }
1186 }
1187 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001188
1189 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if ((strcmp(lower, "latin-1") == 0) ||
1193 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001196 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001201 else if (strcmp(lower, "utf-16") == 0)
1202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203 else if (strcmp(lower, "utf-32") == 0)
1204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (buffer == NULL)
1212 goto onError;
1213 unicode = PyCodec_Decode(buffer, encoding, errors);
1214 if (unicode == NULL)
1215 goto onError;
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_DECREF(unicode);
1221 goto onError;
1222 }
1223 Py_DECREF(buffer);
1224 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 onError:
1227 Py_XDECREF(buffer);
1228 return NULL;
1229}
1230
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v;
1236
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Decode via the codec registry */
1246 v = PyCodec_Decode(unicode, encoding, errors);
1247 if (v == NULL)
1248 goto onError;
1249 return v;
1250
1251 onError:
1252 return NULL;
1253}
1254
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256 const char *encoding,
1257 const char *errors)
1258{
1259 PyObject *v;
1260
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 goto onError;
1264 }
1265
1266 if (encoding == NULL)
1267 encoding = PyUnicode_GetDefaultEncoding();
1268
1269 /* Decode via the codec registry */
1270 v = PyCodec_Decode(unicode, encoding, errors);
1271 if (v == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(v)) {
1274 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001275 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001276 Py_TYPE(v)->tp_name);
1277 Py_DECREF(v);
1278 goto onError;
1279 }
1280 return v;
1281
1282 onError:
1283 return NULL;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 unicode = PyUnicode_FromUnicode(s, size);
1294 if (unicode == NULL)
1295 return NULL;
1296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297 Py_DECREF(unicode);
1298 return v;
1299}
1300
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302 const char *encoding,
1303 const char *errors)
1304{
1305 PyObject *v;
1306
1307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadArgument();
1309 goto onError;
1310 }
1311
1312 if (encoding == NULL)
1313 encoding = PyUnicode_GetDefaultEncoding();
1314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 return v;
1320
1321 onError:
1322 return NULL;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326 const char *encoding,
1327 const char *errors)
1328{
1329 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (!PyUnicode_Check(unicode)) {
1332 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 }
Fred Drakee4315f52000-05-09 19:53:39 +00001335
Tim Petersced69f82003-09-16 20:30:58 +00001336 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001337 encoding = PyUnicode_GetDefaultEncoding();
1338
1339 /* Shortcuts for common default encodings */
1340 if (errors == NULL) {
1341 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001342 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001343 else if (strcmp(encoding, "latin-1") == 0)
1344 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346 else if (strcmp(encoding, "mbcs") == 0)
1347 return PyUnicode_AsMBCSString(unicode);
1348#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001349 else if (strcmp(encoding, "ascii") == 0)
1350 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001351 /* During bootstrap, we may need to find the encodings
1352 package, to load the file system encoding, and require the
1353 file system encoding in order to load the encodings
1354 package.
1355
1356 Break out of this dependency by assuming that the path to
1357 the encodings module is ASCII-only. XXX could try wcstombs
1358 instead, if the file system encoding is the locale's
1359 encoding. */
1360 else if (Py_FileSystemDefaultEncoding &&
1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362 !PyThreadState_GET()->interp->codecs_initialized)
1363 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
1366 /* Encode via the codec registry */
1367 v = PyCodec_Encode(unicode, encoding, errors);
1368 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001369 return NULL;
1370
1371 /* The normal path */
1372 if (PyBytes_Check(v))
1373 return v;
1374
1375 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376 if (PyByteArray_Check(v)) {
1377 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001378 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001379 PyOS_snprintf(msg, sizeof(msg),
1380 "encoder %s returned buffer instead of bytes",
1381 encoding);
1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001383 Py_DECREF(v);
1384 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001385 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388 Py_DECREF(v);
1389 return b;
1390 }
1391
1392 PyErr_Format(PyExc_TypeError,
1393 "encoder did not return a bytes object (type=%.400s)",
1394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
1411 encoding = PyUnicode_GetDefaultEncoding();
1412
1413 /* Encode via the codec registry */
1414 v = PyCodec_Encode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 onError:
1427 return NULL;
1428}
1429
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431 const char *errors)
1432{
1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001434 if (v)
1435 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001436 if (errors != NULL)
1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001439 PyUnicode_GET_SIZE(unicode),
1440 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001441 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001442 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001443 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001444 return v;
1445}
1446
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001448PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001450 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001452
Christian Heimes5894ba72007-11-04 11:43:14 +00001453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457 can be undefined. If it is case, decode using UTF-8. The following assumes
1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459 bootstrapping process where the codecs aren't ready yet.
1460 */
1461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001464 return PyUnicode_DecodeMBCS(s, size, "replace");
1465 }
1466#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001468 return PyUnicode_DecodeUTF8(s, size, "replace");
1469 }
1470#endif
1471 return PyUnicode_Decode(s, size,
1472 Py_FileSystemDefaultEncoding,
1473 "replace");
1474 }
1475 else {
1476 return PyUnicode_DecodeUTF8(s, size, "replace");
1477 }
1478}
1479
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001482{
Christian Heimesf3863112007-11-22 07:46:41 +00001483 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
1486 return NULL;
1487 }
Christian Heimesf3863112007-11-22 07:46:41 +00001488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001490 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001491 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001492 *psize = PyBytes_GET_SIZE(bytes);
1493 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001494}
1495
1496char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001498{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001499 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001500}
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 goto onError;
1507 }
1508 return PyUnicode_AS_UNICODE(unicode);
1509
1510 onError:
1511 return NULL;
1512}
1513
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515{
1516 if (!PyUnicode_Check(unicode)) {
1517 PyErr_BadArgument();
1518 goto onError;
1519 }
1520 return PyUnicode_GET_SIZE(unicode);
1521
1522 onError:
1523 return -1;
1524}
1525
Thomas Wouters78890102000-07-22 19:25:51 +00001526const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001527{
1528 return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001533 if (strcmp(encoding, unicode_default_encoding) != 0) {
1534 PyErr_Format(PyExc_ValueError,
1535 "Can only set default encoding to %s",
1536 unicode_default_encoding);
1537 return -1;
1538 }
Fred Drakee4315f52000-05-09 19:53:39 +00001539 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001540}
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542/* error handling callback helper:
1543 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001544 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 and adjust various state variables.
1546 return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001552 const char **input, const char **inend, Py_ssize_t *startinpos,
1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557
1558 PyObject *restuple = NULL;
1559 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t requiredsize;
1563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001565 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 int res = -1;
1568
1569 if (*errorHandler == NULL) {
1570 *errorHandler = PyCodec_LookupError(errors);
1571 if (*errorHandler == NULL)
1572 goto onError;
1573 }
1574
1575 if (*exceptionObject == NULL) {
1576 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 if (*exceptionObject == NULL)
1579 goto onError;
1580 }
1581 else {
1582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585 goto onError;
1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587 goto onError;
1588 }
1589
1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591 if (restuple == NULL)
1592 goto onError;
1593 if (!PyTuple_Check(restuple)) {
1594 PyErr_Format(PyExc_TypeError, &argparse[4]);
1595 goto onError;
1596 }
1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
1600 /* Copy back the bytes variables, which might have been modified by the
1601 callback */
1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603 if (!inputobj)
1604 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001605 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001608 *input = PyBytes_AS_STRING(inputobj);
1609 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001610 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001611 /* we can DECREF safely, as the exception has another reference,
1612 so the object won't go away. */
1613 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001616 newpos = insize+newpos;
1617 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001619 goto onError;
1620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 /* need more space? (at least enough for what we
1623 have+the replacement+the rest of the string (starting
1624 at the new input position), so we won't have to check space
1625 when there are no errors in the rest of the string) */
1626 repptr = PyUnicode_AS_UNICODE(repunicode);
1627 repsize = PyUnicode_GET_SIZE(repunicode);
1628 requiredsize = *outpos + repsize + insize-newpos;
1629 if (requiredsize > outsize) {
1630 if (requiredsize<2*outsize)
1631 requiredsize = 2*outsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632 if (_PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 goto onError;
1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635 }
1636 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001637 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_UNICODE_COPY(*outptr, repptr, repsize);
1639 *outptr += repsize;
1640 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 /* we made it! */
1643 res = 0;
1644
1645 onError:
1646 Py_XDECREF(restuple);
1647 return res;
1648}
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
/* Per-character UTF-7 classification for the 128 ASCII code points,
   indexed by character value.  Each entry says whether the character
   is special, i.e. cannot be directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1672
/* Helper macros for the UTF-7 codec.  All of them may evaluate their
   arguments more than once, so only pass side-effect free expressions
   (plain variables). */

/* SPECIAL(c, encodeO, encodeWS): true if character c must be base64
   encoded: it is outside ASCII, is classified 1 in utf7_special, or is
   classified 2 / 3 and the matching encodeWS / encodeO flag is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six pending bits are held in
   ch, emit the topmost six as a base64 digit through `out` and reduce
   `bits` accordingly.  Wrapped in do/while(0) so the macro behaves as
   a single statement. */
#define ENCODE(out, ch, bits)                       \
    do {                                            \
        while ((bits) >= 6) {                       \
            *(out)++ = B64((ch) >> ((bits)-6));     \
            (bits) -= 6;                            \
        }                                           \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 pending bits are
   held in ch, extract the topmost 16 as one code unit and append it
   through `out`.  The expanding function must provide a `const char
   *errmsg` variable and a `utf7Error` label, which are used for error
   reporting.
   NOTE(review): the range tested below, 0xDC00..0xDFFF, is the low
   surrogate range; confirm whether the high surrogate range was
   intended. */
#define DECODE(out, ch, bits, surrogate)                                        \
    do {                                                                        \
        while ((bits) >= 16) {                                                  \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);   \
            (bits) -= 16;                                                       \
            if (surrogate) {                                                    \
                /* We have already generated an error for the high surrogate    \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                                \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
                /* This is a surrogate pair. Unfortunately we can't represent   \
                   it in a 16-bit character */                                  \
                (surrogate) = 1;                                                \
                errmsg = "code pairs are not supported";                        \
                goto utf7Error;                                                 \
            } else {                                                            \
                *(out)++ = outCh;                                               \
            }                                                                   \
        }                                                                       \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 const char *errors)
1719{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
1723PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1724 Py_ssize_t size,
1725 const char *errors,
1726 Py_ssize_t *consumed)
1727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001729 Py_ssize_t startinpos;
1730 Py_ssize_t endinpos;
1731 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 const char *e;
1733 PyUnicodeObject *unicode;
1734 Py_UNICODE *p;
1735 const char *errmsg = "";
1736 int inShift = 0;
1737 unsigned int bitsleft = 0;
1738 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 int surrogate = 0;
1740 PyObject *errorHandler = NULL;
1741 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
1743 unicode = _PyUnicode_New(size);
1744 if (!unicode)
1745 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001746 if (size == 0) {
1747 if (consumed)
1748 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751
1752 p = unicode->str;
1753 e = s + size;
1754
1755 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 Py_UNICODE ch;
1757 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001758 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759
1760 if (inShift) {
1761 if ((ch == '-') || !B64CHAR(ch)) {
1762 inShift = 0;
1763 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1766 if (bitsleft >= 6) {
1767 /* The shift sequence has a partial character in it. If
1768 bitsleft < 6 then we could just classify it as padding
1769 but that is not the case here */
1770
1771 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001772 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 }
1774 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001775 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 here so indicate the potential of a misencoded character. */
1777
1778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1780 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001781 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 }
1783
1784 if (ch == '-') {
1785 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001786 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 inShift = 1;
1788 }
1789 } else if (SPECIAL(ch,0,0)) {
1790 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001791 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 } else {
1793 *p++ = ch;
1794 }
1795 } else {
1796 charsleft = (charsleft << 6) | UB64(ch);
1797 bitsleft += 6;
1798 s++;
1799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1800 }
1801 }
1802 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 s++;
1805 if (s < e && *s == '-') {
1806 s++;
1807 *p++ = '+';
1808 } else
1809 {
1810 inShift = 1;
1811 bitsleft = 0;
1812 }
1813 }
1814 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001815 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 errmsg = "unexpected special character";
1817 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001818 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 }
1820 else {
1821 *p++ = ch;
1822 s++;
1823 }
1824 continue;
1825 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 outpos = p-PyUnicode_AS_UNICODE(unicode);
1827 endinpos = s-starts;
1828 if (unicode_decode_call_errorhandler(
1829 errors, &errorHandler,
1830 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001831 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001832 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 }
1835
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001836 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 outpos = p-PyUnicode_AS_UNICODE(unicode);
1838 endinpos = size;
1839 if (unicode_decode_call_errorhandler(
1840 errors, &errorHandler,
1841 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001843 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 if (s < e)
1846 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001848 if (consumed) {
1849 if(inShift)
1850 *consumed = startinpos;
1851 else
1852 *consumed = s-starts;
1853 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 goto onError;
1857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 return (PyObject *)unicode;
1861
1862onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865 Py_DECREF(unicode);
1866 return NULL;
1867}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 int encodeSetO,
1873 int encodeWhiteSpace,
1874 const char *errors)
1875{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001876 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 unsigned int bitsleft = 0;
1882 unsigned long charsleft = 0;
1883 char * out;
1884 char * start;
1885
1886 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001887 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001889 if (cbAllocated / 5 != size)
1890 return PyErr_NoMemory();
1891
Christian Heimes9c4756e2008-05-26 13:22:05 +00001892 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (v == NULL)
1894 return NULL;
1895
Christian Heimes9c4756e2008-05-26 13:22:05 +00001896 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 for (;i < size; ++i) {
1898 Py_UNICODE ch = s[i];
1899
1900 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001901 if (ch == '+') {
1902 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *out++ = '-';
1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905 charsleft = ch;
1906 bitsleft = 16;
1907 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 } else {
1911 *out++ = (char) ch;
1912 }
1913 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915 *out++ = B64(charsleft << (6-bitsleft));
1916 charsleft = 0;
1917 bitsleft = 0;
1918 /* Characters not in the BASE64 set implicitly unshift the sequence
1919 so no '-' is required, except if the character is itself a '-' */
1920 if (B64CHAR(ch) || ch == '-') {
1921 *out++ = '-';
1922 }
1923 inShift = 0;
1924 *out++ = (char) ch;
1925 } else {
1926 bitsleft += 16;
1927 charsleft = (charsleft << 16) | ch;
1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001931 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 or '-' then the shift sequence will be terminated implicitly and we
1933 don't have to insert a '-'. */
1934
1935 if (bitsleft == 0) {
1936 if (i + 1 < size) {
1937 Py_UNICODE ch2 = s[i+1];
1938
1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 } else if (B64CHAR(ch2) || ch2 == '-') {
1942 *out++ = '-';
1943 inShift = 0;
1944 } else {
1945 inShift = 0;
1946 }
1947
1948 }
1949 else {
1950 *out++ = '-';
1951 inShift = 0;
1952 }
1953 }
Tim Petersced69f82003-09-16 20:30:58 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (bitsleft) {
1958 *out++= B64(charsleft << (6-bitsleft) );
1959 *out++ = '-';
1960 }
1961
Christian Heimes72b710a2008-05-26 13:28:38 +00001962 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001963 Py_DECREF(v);
1964 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965}
1966
/* Undefine the UTF-7 helper macros now that the UTF-7 codec functions
   are complete. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1973
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974/* --- UTF-8 Codec -------------------------------------------------------- */
1975
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: lengths 4, 5, 6; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1997
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 const char *errors)
2001{
Walter Dörwald69652032004-09-07 20:24:22 +00002002 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2003}
2004
2005PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002007 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002008 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002010 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t startinpos;
2013 Py_ssize_t endinpos;
2014 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 const char *e;
2016 PyUnicodeObject *unicode;
2017 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002018 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Note: size will always be longer than the resulting Unicode
2023 character count */
2024 unicode = _PyUnicode_New(size);
2025 if (!unicode)
2026 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002027 if (size == 0) {
2028 if (consumed)
2029 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032
2033 /* Unpack UTF-8 encoded data */
2034 p = unicode->str;
2035 e = s + size;
2036
2037 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039
2040 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002041 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 s++;
2043 continue;
2044 }
2045
2046 n = utf8_code_length[ch];
2047
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002048 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
2050 break;
2051 else {
2052 errmsg = "unexpected end of data";
2053 startinpos = s-starts;
2054 endinpos = size;
2055 goto utf8Error;
2056 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
2059 switch (n) {
2060
2061 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002062 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 startinpos = s-starts;
2064 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002065 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
2067 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002068 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 startinpos = s-starts;
2070 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002071 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
2073 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002074 if ((s[1] & 0xc0) != 0x80) {
2075 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 startinpos = s-starts;
2077 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002078 goto utf8Error;
2079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002081 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 startinpos = s-starts;
2083 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002084 errmsg = "illegal encoding";
2085 goto utf8Error;
2086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002088 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 break;
2090
2091 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002092 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002093 (s[2] & 0xc0) != 0x80) {
2094 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 startinpos = s-starts;
2096 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002097 goto utf8Error;
2098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002100 if (ch < 0x0800) {
2101 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002102 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002103
2104 XXX For wide builds (UCS-4) we should probably try
2105 to recombine the surrogates into a single code
2106 unit.
2107 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002108 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002109 startinpos = s-starts;
2110 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002111 goto utf8Error;
2112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002114 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 break;
2116
2117 case 4:
2118 if ((s[1] & 0xc0) != 0x80 ||
2119 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002120 (s[3] & 0xc0) != 0x80) {
2121 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 startinpos = s-starts;
2123 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002124 goto utf8Error;
2125 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2127 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2128 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002130 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002132 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002133 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 startinpos = s-starts;
2136 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 goto utf8Error;
2138 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002139#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002140 *p++ = (Py_UNICODE)ch;
2141#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002143
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002144 /* translate from 10000..10FFFF to 0..FFFF */
2145 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002146
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002147 /* high surrogate = top 10 bits added to D800 */
2148 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002149
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002150 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002151 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002152#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 break;
2154
2155 default:
2156 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002157 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 startinpos = s-starts;
2159 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002160 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 }
2162 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002163 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002165 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 outpos = p-PyUnicode_AS_UNICODE(unicode);
2167 if (unicode_decode_call_errorhandler(
2168 errors, &errorHandler,
2169 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002170 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002171 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 }
Walter Dörwald69652032004-09-07 20:24:22 +00002174 if (consumed)
2175 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176
2177 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002178 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 goto onError;
2180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 Py_XDECREF(errorHandler);
2182 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return (PyObject *)unicode;
2184
2185onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002186 Py_XDECREF(errorHandler);
2187 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 Py_DECREF(unicode);
2189 return NULL;
2190}
2191
Tim Peters602f7402002-04-27 18:03:26 +00002192/* Allocation strategy: if the string is short, convert into a stack buffer
2193 and allocate exactly as much space needed at the end. Else allocate the
2194 maximum possible needed (4 result bytes per Unicode character), and return
2195 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002196*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002197PyObject *
2198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002199 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201{
Tim Peters602f7402002-04-27 18:03:26 +00002202#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002203
Guido van Rossum98297ee2007-11-06 21:34:58 +00002204 Py_ssize_t i; /* index into s of next input byte */
2205 PyObject *result; /* result string object */
2206 char *p; /* next free byte in output buffer */
2207 Py_ssize_t nallocated; /* number of result bytes allocated */
2208 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002209 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002210
Tim Peters602f7402002-04-27 18:03:26 +00002211 assert(s != NULL);
2212 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213
Tim Peters602f7402002-04-27 18:03:26 +00002214 if (size <= MAX_SHORT_UNICHARS) {
2215 /* Write into the stack buffer; nallocated can't overflow.
2216 * At the end, we'll allocate exactly as much heap space as it
2217 * turns out we need.
2218 */
2219 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002220 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002221 p = stackbuf;
2222 }
2223 else {
2224 /* Overallocate on the heap, and give the excess back at the end. */
2225 nallocated = size * 4;
2226 if (nallocated / 4 != size) /* overflow! */
2227 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002228 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002229 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002230 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002231 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002232 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002233
Tim Peters602f7402002-04-27 18:03:26 +00002234 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002235 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002236
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002237 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002238 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002240
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002242 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002243 *p++ = (char)(0xc0 | (ch >> 6));
2244 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002245 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002246 else {
Tim Peters602f7402002-04-27 18:03:26 +00002247 /* Encode UCS2 Unicode ordinals */
2248 if (ch < 0x10000) {
2249 /* Special case: check for high surrogate */
2250 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2251 Py_UCS4 ch2 = s[i];
2252 /* Check for low surrogate and combine the two to
2253 form a UCS4 value */
2254 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002255 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002256 i++;
2257 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002258 }
Tim Peters602f7402002-04-27 18:03:26 +00002259 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002260 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002262 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2263 *p++ = (char)(0x80 | (ch & 0x3f));
2264 continue;
2265 }
2266encodeUCS4:
2267 /* Encode UCS4 Unicode ordinals */
2268 *p++ = (char)(0xf0 | (ch >> 18));
2269 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2270 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2271 *p++ = (char)(0x80 | (ch & 0x3f));
2272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002274
Guido van Rossum98297ee2007-11-06 21:34:58 +00002275 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002276 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002277 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002278 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002279 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002280 }
2281 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002282 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002283 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002284 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002285 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002286 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002287 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002288
Tim Peters602f7402002-04-27 18:03:26 +00002289#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290}
2291
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2293{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 if (!PyUnicode_Check(unicode)) {
2295 PyErr_BadArgument();
2296 return NULL;
2297 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002298 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2299 PyUnicode_GET_SIZE(unicode),
2300 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301}
2302
Walter Dörwald41980ca2007-08-16 21:55:45 +00002303/* --- UTF-32 Codec ------------------------------------------------------- */
2304
2305PyObject *
2306PyUnicode_DecodeUTF32(const char *s,
2307 Py_ssize_t size,
2308 const char *errors,
2309 int *byteorder)
2310{
2311 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2312}
2313
2314PyObject *
2315PyUnicode_DecodeUTF32Stateful(const char *s,
2316 Py_ssize_t size,
2317 const char *errors,
2318 int *byteorder,
2319 Py_ssize_t *consumed)
2320{
2321 const char *starts = s;
2322 Py_ssize_t startinpos;
2323 Py_ssize_t endinpos;
2324 Py_ssize_t outpos;
2325 PyUnicodeObject *unicode;
2326 Py_UNICODE *p;
2327#ifndef Py_UNICODE_WIDE
2328 int i, pairs;
2329#else
2330 const int pairs = 0;
2331#endif
2332 const unsigned char *q, *e;
2333 int bo = 0; /* assume native ordering by default */
2334 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002335 /* Offsets from q for retrieving bytes in the right order. */
2336#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2337 int iorder[] = {0, 1, 2, 3};
2338#else
2339 int iorder[] = {3, 2, 1, 0};
2340#endif
2341 PyObject *errorHandler = NULL;
2342 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002343 /* On narrow builds we split characters outside the BMP into two
2344 codepoints => count how much extra space we need. */
2345#ifndef Py_UNICODE_WIDE
2346 for (i = pairs = 0; i < size/4; i++)
2347 if (((Py_UCS4 *)s)[i] >= 0x10000)
2348 pairs++;
2349#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002350
2351 /* This might be one to much, because of a BOM */
2352 unicode = _PyUnicode_New((size+3)/4+pairs);
2353 if (!unicode)
2354 return NULL;
2355 if (size == 0)
2356 return (PyObject *)unicode;
2357
2358 /* Unpack UTF-32 encoded data */
2359 p = unicode->str;
2360 q = (unsigned char *)s;
2361 e = q + size;
2362
2363 if (byteorder)
2364 bo = *byteorder;
2365
2366 /* Check for BOM marks (U+FEFF) in the input and adjust current
2367 byte order setting accordingly. In native mode, the leading BOM
2368 mark is skipped, in all other modes, it is copied to the output
2369 stream as-is (giving a ZWNBSP character). */
2370 if (bo == 0) {
2371 if (size >= 4) {
2372 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2373 (q[iorder[1]] << 8) | q[iorder[0]];
2374#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375 if (bom == 0x0000FEFF) {
2376 q += 4;
2377 bo = -1;
2378 }
2379 else if (bom == 0xFFFE0000) {
2380 q += 4;
2381 bo = 1;
2382 }
2383#else
2384 if (bom == 0x0000FEFF) {
2385 q += 4;
2386 bo = 1;
2387 }
2388 else if (bom == 0xFFFE0000) {
2389 q += 4;
2390 bo = -1;
2391 }
2392#endif
2393 }
2394 }
2395
2396 if (bo == -1) {
2397 /* force LE */
2398 iorder[0] = 0;
2399 iorder[1] = 1;
2400 iorder[2] = 2;
2401 iorder[3] = 3;
2402 }
2403 else if (bo == 1) {
2404 /* force BE */
2405 iorder[0] = 3;
2406 iorder[1] = 2;
2407 iorder[2] = 1;
2408 iorder[3] = 0;
2409 }
2410
2411 while (q < e) {
2412 Py_UCS4 ch;
2413 /* remaining bytes at the end? (size should be divisible by 4) */
2414 if (e-q<4) {
2415 if (consumed)
2416 break;
2417 errmsg = "truncated data";
2418 startinpos = ((const char *)q)-starts;
2419 endinpos = ((const char *)e)-starts;
2420 goto utf32Error;
2421 /* The remaining input chars are ignored if the callback
2422 chooses to skip the input */
2423 }
2424 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2425 (q[iorder[1]] << 8) | q[iorder[0]];
2426
2427 if (ch >= 0x110000)
2428 {
2429 errmsg = "codepoint not in range(0x110000)";
2430 startinpos = ((const char *)q)-starts;
2431 endinpos = startinpos+4;
2432 goto utf32Error;
2433 }
2434#ifndef Py_UNICODE_WIDE
2435 if (ch >= 0x10000)
2436 {
2437 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2438 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2439 }
2440 else
2441#endif
2442 *p++ = ch;
2443 q += 4;
2444 continue;
2445 utf32Error:
2446 outpos = p-PyUnicode_AS_UNICODE(unicode);
2447 if (unicode_decode_call_errorhandler(
2448 errors, &errorHandler,
2449 "utf32", errmsg,
2450 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002451 &unicode, &outpos, &p))
Walter Dörwald41980ca2007-08-16 21:55:45 +00002452 goto onError;
2453 }
2454
2455 if (byteorder)
2456 *byteorder = bo;
2457
2458 if (consumed)
2459 *consumed = (const char *)q-starts;
2460
2461 /* Adjust length */
2462 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2463 goto onError;
2464
2465 Py_XDECREF(errorHandler);
2466 Py_XDECREF(exc);
2467 return (PyObject *)unicode;
2468
2469onError:
2470 Py_DECREF(unicode);
2471 Py_XDECREF(errorHandler);
2472 Py_XDECREF(exc);
2473 return NULL;
2474}
2475
2476PyObject *
2477PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2478 Py_ssize_t size,
2479 const char *errors,
2480 int byteorder)
2481{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002482 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002484 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002486 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002487#else
2488 const int pairs = 0;
2489#endif
2490 /* Offsets from p for storing byte pairs in the right order. */
2491#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2492 int iorder[] = {0, 1, 2, 3};
2493#else
2494 int iorder[] = {3, 2, 1, 0};
2495#endif
2496
2497#define STORECHAR(CH) \
2498 do { \
2499 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2500 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2501 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2502 p[iorder[0]] = (CH) & 0xff; \
2503 p += 4; \
2504 } while(0)
2505
2506 /* In narrow builds we can output surrogate pairs as one codepoint,
2507 so we need less space. */
2508#ifndef Py_UNICODE_WIDE
2509 for (i = pairs = 0; i < size-1; i++)
2510 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2511 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2512 pairs++;
2513#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002514 nsize = (size - pairs + (byteorder == 0));
2515 bytesize = nsize * 4;
2516 if (bytesize / 4 != nsize)
2517 return PyErr_NoMemory();
2518 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002519 if (v == NULL)
2520 return NULL;
2521
Christian Heimes9c4756e2008-05-26 13:22:05 +00002522 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002523 if (byteorder == 0)
2524 STORECHAR(0xFEFF);
2525 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002526 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002527
2528 if (byteorder == -1) {
2529 /* force LE */
2530 iorder[0] = 0;
2531 iorder[1] = 1;
2532 iorder[2] = 2;
2533 iorder[3] = 3;
2534 }
2535 else if (byteorder == 1) {
2536 /* force BE */
2537 iorder[0] = 3;
2538 iorder[1] = 2;
2539 iorder[2] = 1;
2540 iorder[3] = 0;
2541 }
2542
2543 while (size-- > 0) {
2544 Py_UCS4 ch = *s++;
2545#ifndef Py_UNICODE_WIDE
2546 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2547 Py_UCS4 ch2 = *s;
2548 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2549 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2550 s++;
2551 size--;
2552 }
2553 }
2554#endif
2555 STORECHAR(ch);
2556 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002557
2558 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002559 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002560 Py_DECREF(v);
2561 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562#undef STORECHAR
2563}
2564
2565PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2566{
2567 if (!PyUnicode_Check(unicode)) {
2568 PyErr_BadArgument();
2569 return NULL;
2570 }
2571 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2572 PyUnicode_GET_SIZE(unicode),
2573 NULL,
2574 0);
2575}
2576
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577/* --- UTF-16 Codec ------------------------------------------------------- */
2578
Tim Peters772747b2001-08-09 22:21:55 +00002579PyObject *
2580PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002582 const char *errors,
2583 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584{
Walter Dörwald69652032004-09-07 20:24:22 +00002585 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2586}
2587
2588PyObject *
2589PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002590 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002591 const char *errors,
2592 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002593 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002596 Py_ssize_t startinpos;
2597 Py_ssize_t endinpos;
2598 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 PyUnicodeObject *unicode;
2600 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002601 const unsigned char *q, *e;
2602 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002603 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002604 /* Offsets from q for retrieving byte pairs in the right order. */
2605#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2606 int ihi = 1, ilo = 0;
2607#else
2608 int ihi = 0, ilo = 1;
2609#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 PyObject *errorHandler = NULL;
2611 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
2613 /* Note: size will always be longer than the resulting Unicode
2614 character count */
2615 unicode = _PyUnicode_New(size);
2616 if (!unicode)
2617 return NULL;
2618 if (size == 0)
2619 return (PyObject *)unicode;
2620
2621 /* Unpack UTF-16 encoded data */
2622 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002623 q = (unsigned char *)s;
2624 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625
2626 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002627 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002629 /* Check for BOM marks (U+FEFF) in the input and adjust current
2630 byte order setting accordingly. In native mode, the leading BOM
2631 mark is skipped, in all other modes, it is copied to the output
2632 stream as-is (giving a ZWNBSP character). */
2633 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002634 if (size >= 2) {
2635 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002636#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002637 if (bom == 0xFEFF) {
2638 q += 2;
2639 bo = -1;
2640 }
2641 else if (bom == 0xFFFE) {
2642 q += 2;
2643 bo = 1;
2644 }
Tim Petersced69f82003-09-16 20:30:58 +00002645#else
Walter Dörwald69652032004-09-07 20:24:22 +00002646 if (bom == 0xFEFF) {
2647 q += 2;
2648 bo = 1;
2649 }
2650 else if (bom == 0xFFFE) {
2651 q += 2;
2652 bo = -1;
2653 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002654#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002655 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657
Tim Peters772747b2001-08-09 22:21:55 +00002658 if (bo == -1) {
2659 /* force LE */
2660 ihi = 1;
2661 ilo = 0;
2662 }
2663 else if (bo == 1) {
2664 /* force BE */
2665 ihi = 0;
2666 ilo = 1;
2667 }
2668
2669 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002671 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002673 if (consumed)
2674 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 errmsg = "truncated data";
2676 startinpos = ((const char *)q)-starts;
2677 endinpos = ((const char *)e)-starts;
2678 goto utf16Error;
2679 /* The remaining input chars are ignored if the callback
2680 chooses to skip the input */
2681 }
2682 ch = (q[ihi] << 8) | q[ilo];
2683
Tim Peters772747b2001-08-09 22:21:55 +00002684 q += 2;
2685
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 if (ch < 0xD800 || ch > 0xDFFF) {
2687 *p++ = ch;
2688 continue;
2689 }
2690
2691 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002692 if (q >= e) {
2693 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 startinpos = (((const char *)q)-2)-starts;
2695 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002696 goto utf16Error;
2697 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002698 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002699 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2700 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002701 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002702#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002703 *p++ = ch;
2704 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#else
2706 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002707#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002709 }
2710 else {
2711 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 startinpos = (((const char *)q)-4)-starts;
2713 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002714 goto utf16Error;
2715 }
2716
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002718 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 startinpos = (((const char *)q)-2)-starts;
2720 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002721 /* Fall through to report the error */
2722
2723 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 outpos = p-PyUnicode_AS_UNICODE(unicode);
2725 if (unicode_decode_call_errorhandler(
2726 errors, &errorHandler,
2727 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002728 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002729 &unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 }
2732
2733 if (byteorder)
2734 *byteorder = bo;
2735
Walter Dörwald69652032004-09-07 20:24:22 +00002736 if (consumed)
2737 *consumed = (const char *)q-starts;
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002740 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 goto onError;
2742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 return (PyObject *)unicode;
2746
2747onError:
2748 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 return NULL;
2752}
2753
Tim Peters772747b2001-08-09 22:21:55 +00002754PyObject *
2755PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002756 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002757 const char *errors,
2758 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002760 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002761 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002762 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002763#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002764 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002765#else
2766 const int pairs = 0;
2767#endif
Tim Peters772747b2001-08-09 22:21:55 +00002768 /* Offsets from p for storing byte pairs in the right order. */
2769#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2770 int ihi = 1, ilo = 0;
2771#else
2772 int ihi = 0, ilo = 1;
2773#endif
2774
2775#define STORECHAR(CH) \
2776 do { \
2777 p[ihi] = ((CH) >> 8) & 0xff; \
2778 p[ilo] = (CH) & 0xff; \
2779 p += 2; \
2780 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002782#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002783 for (i = pairs = 0; i < size; i++)
2784 if (s[i] >= 0x10000)
2785 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002786#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002787 /* 2 * (size + pairs + (byteorder == 0)) */
2788 if (size > PY_SSIZE_T_MAX ||
2789 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2790 return PyErr_NoMemory();
2791 nsize = size + pairs + (byteorder == 0);
2792 bytesize = nsize * 2;
2793 if (bytesize / 2 != nsize)
2794 return PyErr_NoMemory();
2795 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 if (v == NULL)
2797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Christian Heimes9c4756e2008-05-26 13:22:05 +00002799 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002801 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002802 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002803 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002804
2805 if (byteorder == -1) {
2806 /* force LE */
2807 ihi = 1;
2808 ilo = 0;
2809 }
2810 else if (byteorder == 1) {
2811 /* force BE */
2812 ihi = 0;
2813 ilo = 1;
2814 }
2815
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002816 while (size-- > 0) {
2817 Py_UNICODE ch = *s++;
2818 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002819#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002820 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002821 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2822 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002824#endif
Tim Peters772747b2001-08-09 22:21:55 +00002825 STORECHAR(ch);
2826 if (ch2)
2827 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002828 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829
2830 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002831 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002832 Py_DECREF(v);
2833 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002834#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835}
2836
2837PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2838{
2839 if (!PyUnicode_Check(unicode)) {
2840 PyErr_BadArgument();
2841 return NULL;
2842 }
2843 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2844 PyUnicode_GET_SIZE(unicode),
2845 NULL,
2846 0);
2847}
2848
2849/* --- Unicode Escape Codec ----------------------------------------------- */
2850
/* Cached pointer to the name-lookup C API used for \N{name} escapes.
   It stays NULL until PyUnicode_DecodeUnicodeEscape() first meets such an
   escape, at which point it is fetched from the "ucnhash_CAPI" attribute
   of the unicodedata module. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002851static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002852
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 const char *errors)
2856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 Py_ssize_t startinpos;
2859 Py_ssize_t endinpos;
2860 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002865 char* message;
2866 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 PyObject *errorHandler = NULL;
2868 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002869
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 /* Escaped strings will always be longer than the resulting
2871 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 length after conversion to the true value.
2873 (but if the error callback returns a long replacement string
2874 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 v = _PyUnicode_New(size);
2876 if (v == NULL)
2877 goto onError;
2878 if (size == 0)
2879 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 while (s < end) {
2885 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002886 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888
2889 /* Non-escape characters are interpreted as Unicode ordinals */
2890 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002891 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 continue;
2893 }
2894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 /* \ - Escapes */
2897 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002898 c = *s++;
2899 if (s > end)
2900 c = '\0'; /* Invalid after \ */
2901 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902
2903 /* \x escapes */
2904 case '\n': break;
2905 case '\\': *p++ = '\\'; break;
2906 case '\'': *p++ = '\''; break;
2907 case '\"': *p++ = '\"'; break;
2908 case 'b': *p++ = '\b'; break;
2909 case 'f': *p++ = '\014'; break; /* FF */
2910 case 't': *p++ = '\t'; break;
2911 case 'n': *p++ = '\n'; break;
2912 case 'r': *p++ = '\r'; break;
2913 case 'v': *p++ = '\013'; break; /* VT */
2914 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2915
2916 /* \OOO (octal) escapes */
2917 case '0': case '1': case '2': case '3':
2918 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002919 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002920 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002921 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002922 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002923 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002925 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 break;
2927
Fredrik Lundhccc74732001-02-18 22:13:49 +00002928 /* hex escapes */
2929 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002931 digits = 2;
2932 message = "truncated \\xXX escape";
2933 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002937 digits = 4;
2938 message = "truncated \\uXXXX escape";
2939 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940
Fredrik Lundhccc74732001-02-18 22:13:49 +00002941 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002942 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002943 digits = 8;
2944 message = "truncated \\UXXXXXXXX escape";
2945 hexescape:
2946 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 outpos = p-PyUnicode_AS_UNICODE(v);
2948 if (s+digits>end) {
2949 endinpos = size;
2950 if (unicode_decode_call_errorhandler(
2951 errors, &errorHandler,
2952 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002953 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002954 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 goto onError;
2956 goto nextByte;
2957 }
2958 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002959 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002960 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 endinpos = (s+i+1)-starts;
2962 if (unicode_decode_call_errorhandler(
2963 errors, &errorHandler,
2964 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002965 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002966 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002969 }
2970 chr = (chr<<4) & ~0xF;
2971 if (c >= '0' && c <= '9')
2972 chr += c - '0';
2973 else if (c >= 'a' && c <= 'f')
2974 chr += 10 + c - 'a';
2975 else
2976 chr += 10 + c - 'A';
2977 }
2978 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002979 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 /* _decoding_error will have already written into the
2981 target buffer. */
2982 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002983 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002984 /* when we get here, chr is a 32-bit unicode character */
2985 if (chr <= 0xffff)
2986 /* UCS-2 character */
2987 *p++ = (Py_UNICODE) chr;
2988 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002989 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002990 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002991#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002992 *p++ = chr;
2993#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002994 chr -= 0x10000L;
2995 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002996 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002997#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002998 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 endinpos = s-starts;
3000 outpos = p-PyUnicode_AS_UNICODE(v);
3001 if (unicode_decode_call_errorhandler(
3002 errors, &errorHandler,
3003 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003004 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003005 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003006 goto onError;
3007 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003008 break;
3009
3010 /* \N{name} */
3011 case 'N':
3012 message = "malformed \\N character escape";
3013 if (ucnhash_CAPI == NULL) {
3014 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003015 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003016 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003017 if (m == NULL)
3018 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003019 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003020 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003021 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003022 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003023 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003024 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003025 if (ucnhash_CAPI == NULL)
3026 goto ucnhashError;
3027 }
3028 if (*s == '{') {
3029 const char *start = s+1;
3030 /* look for the closing brace */
3031 while (*s != '}' && s < end)
3032 s++;
3033 if (s > start && s < end && *s == '}') {
3034 /* found a name. look it up in the unicode database */
3035 message = "unknown Unicode character name";
3036 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003037 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003038 goto store;
3039 }
3040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 endinpos = s-starts;
3042 outpos = p-PyUnicode_AS_UNICODE(v);
3043 if (unicode_decode_call_errorhandler(
3044 errors, &errorHandler,
3045 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003046 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003047 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003048 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003049 break;
3050
3051 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003052 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 message = "\\ at end of string";
3054 s--;
3055 endinpos = s-starts;
3056 outpos = p-PyUnicode_AS_UNICODE(v);
3057 if (unicode_decode_call_errorhandler(
3058 errors, &errorHandler,
3059 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003060 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003061 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003062 goto onError;
3063 }
3064 else {
3065 *p++ = '\\';
3066 *p++ = (unsigned char)s[-1];
3067 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003068 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 nextByte:
3071 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003073 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003075 Py_XDECREF(errorHandler);
3076 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003078
Fredrik Lundhccc74732001-02-18 22:13:49 +00003079ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003080 PyErr_SetString(
3081 PyExc_UnicodeError,
3082 "\\N escapes not supported (can't load unicodedata module)"
3083 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003084 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 Py_XDECREF(errorHandler);
3086 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003087 return NULL;
3088
Fredrik Lundhccc74732001-02-18 22:13:49 +00003089onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 Py_XDECREF(errorHandler);
3092 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 return NULL;
3094}
3095
3096/* Return a Unicode-Escape string version of the Unicode object.
3097
3098 If quotes is true, the string is enclosed in u"" or u'' quotes as
3099 appropriate.
3100
3101*/
3102
Thomas Wouters477c8d52006-05-27 19:21:47 +00003103Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3104 Py_ssize_t size,
3105 Py_UNICODE ch)
3106{
3107 /* like wcschr, but doesn't stop at NULL characters */
3108
3109 while (size-- > 0) {
3110 if (*s == ch)
3111 return s;
3112 s++;
3113 }
3114
3115 return NULL;
3116}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003117
/* Lowercase hexadecimal digits; the escape encoders below index this
   string with a 4-bit value (0..15) to emit one hex digit. */
static const char *hexdigits = "0123456789abcdef";
3119
3120PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3121 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003123 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003126#ifdef Py_UNICODE_WIDE
3127 const Py_ssize_t expandsize = 10;
3128#else
3129 const Py_ssize_t expandsize = 6;
3130#endif
3131
Thomas Wouters89f507f2006-12-13 04:49:30 +00003132 /* XXX(nnorwitz): rather than over-allocating, it would be
3133 better to choose a different scheme. Perhaps scan the
3134 first N-chars of the string and allocate based on that size.
3135 */
3136 /* Initial allocation is based on the longest-possible unichr
3137 escape.
3138
3139 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3140 unichr, so in this case it's the longest unichr escape. In
3141 narrow (UTF-16) builds this is five chars per source unichr
3142 since there are two unichrs in the surrogate pair, so in narrow
3143 (UTF-16) builds it's not the longest unichr escape.
3144
3145 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3146 so in the narrow (UTF-16) build case it's the longest unichr
3147 escape.
3148 */
3149
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003150 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3151 return PyErr_NoMemory();
3152
Christian Heimes9c4756e2008-05-26 13:22:05 +00003153 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003154 2
3155 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003156 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (repr == NULL)
3158 return NULL;
3159
Christian Heimes9c4756e2008-05-26 13:22:05 +00003160 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 while (size-- > 0) {
3163 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003164
Walter Dörwald79e913e2007-05-12 11:08:06 +00003165 /* Escape backslashes */
3166 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 *p++ = '\\';
3168 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003169 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003170 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003171
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003172#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003173 /* Map 21-bit characters to '\U00xxxxxx' */
3174 else if (ch >= 0x10000) {
3175 *p++ = '\\';
3176 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003177 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3178 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3179 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3180 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3181 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3182 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3183 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3184 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003185 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003186 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003187#else
3188 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003189 else if (ch >= 0xD800 && ch < 0xDC00) {
3190 Py_UNICODE ch2;
3191 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003192
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003193 ch2 = *s++;
3194 size--;
3195 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3196 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3197 *p++ = '\\';
3198 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003199 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3200 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3201 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3202 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3203 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3204 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3205 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3206 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003207 continue;
3208 }
3209 /* Fall through: isolated surrogates are copied as-is */
3210 s--;
3211 size++;
3212 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003213#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003216 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 *p++ = '\\';
3218 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003219 *p++ = hexdigits[(ch >> 12) & 0x000F];
3220 *p++ = hexdigits[(ch >> 8) & 0x000F];
3221 *p++ = hexdigits[(ch >> 4) & 0x000F];
3222 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003225 /* Map special whitespace to '\t', \n', '\r' */
3226 else if (ch == '\t') {
3227 *p++ = '\\';
3228 *p++ = 't';
3229 }
3230 else if (ch == '\n') {
3231 *p++ = '\\';
3232 *p++ = 'n';
3233 }
3234 else if (ch == '\r') {
3235 *p++ = '\\';
3236 *p++ = 'r';
3237 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003238
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003239 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003240 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003242 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003243 *p++ = hexdigits[(ch >> 4) & 0x000F];
3244 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003245 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003246
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 /* Copy everything else as-is */
3248 else
3249 *p++ = (char) ch;
3250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251
Christian Heimes72b710a2008-05-26 13:28:38 +00003252 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003253 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003254 Py_DECREF(repr);
3255 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256}
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3259{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003260 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 return NULL;
3264 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003265 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3266 PyUnicode_GET_SIZE(unicode));
3267
3268 if (!s)
3269 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003270 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003271 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003272 Py_DECREF(s);
3273 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274}
3275
3276/* --- Raw Unicode Escape Codec ------------------------------------------- */
3277
3278PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003279 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 const char *errors)
3281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003283 Py_ssize_t startinpos;
3284 Py_ssize_t endinpos;
3285 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 const char *end;
3289 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 PyObject *errorHandler = NULL;
3291 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 /* Escaped strings will always be longer than the resulting
3294 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 length after conversion to the true value. (But decoding error
3296 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 v = _PyUnicode_New(size);
3298 if (v == NULL)
3299 goto onError;
3300 if (size == 0)
3301 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 end = s + size;
3304 while (s < end) {
3305 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003306 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003308 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 /* Non-escape characters are interpreted as Unicode ordinals */
3311 if (*s != '\\') {
3312 *p++ = (unsigned char)*s++;
3313 continue;
3314 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316
3317 /* \u-escapes are only interpreted iff the number of leading
3318 backslashes if odd */
3319 bs = s;
3320 for (;s < end;) {
3321 if (*s != '\\')
3322 break;
3323 *p++ = (unsigned char)*s++;
3324 }
3325 if (((s - bs) & 1) == 0 ||
3326 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003327 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 continue;
3329 }
3330 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003331 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 s++;
3333
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003336 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003338 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339 endinpos = s-starts;
3340 if (unicode_decode_call_errorhandler(
3341 errors, &errorHandler,
3342 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003343 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003344 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 }
3348 x = (x<<4) & ~0xF;
3349 if (c >= '0' && c <= '9')
3350 x += c - '0';
3351 else if (c >= 'a' && c <= 'f')
3352 x += 10 + c - 'a';
3353 else
3354 x += 10 + c - 'A';
3355 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003356 if (x <= 0xffff)
3357 /* UCS-2 character */
3358 *p++ = (Py_UNICODE) x;
3359 else if (x <= 0x10ffff) {
3360 /* UCS-4 character. Either store directly, or as
3361 surrogate pair. */
3362#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003363 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003364#else
3365 x -= 0x10000L;
3366 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3367 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3368#endif
3369 } else {
3370 endinpos = s-starts;
3371 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003372 if (unicode_decode_call_errorhandler(
3373 errors, &errorHandler,
3374 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003375 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003376 &v, &outpos, &p))
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003377 goto onError;
3378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 nextByte:
3380 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003382 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003383 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 Py_XDECREF(errorHandler);
3385 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003387
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 onError:
3389 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 Py_XDECREF(errorHandler);
3391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 return NULL;
3393}
3394
3395PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003396 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003398 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 char *p;
3400 char *q;
3401
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003402#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003403 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003404#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003405 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003406#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003407
3408 if (size > PY_SSIZE_T_MAX / expandsize)
3409 return PyErr_NoMemory();
3410
3411 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 if (repr == NULL)
3413 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003414 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003415 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
Christian Heimes9c4756e2008-05-26 13:22:05 +00003417 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 while (size-- > 0) {
3419 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003420#ifdef Py_UNICODE_WIDE
3421 /* Map 32-bit characters to '\Uxxxxxxxx' */
3422 if (ch >= 0x10000) {
3423 *p++ = '\\';
3424 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003425 *p++ = hexdigits[(ch >> 28) & 0xf];
3426 *p++ = hexdigits[(ch >> 24) & 0xf];
3427 *p++ = hexdigits[(ch >> 20) & 0xf];
3428 *p++ = hexdigits[(ch >> 16) & 0xf];
3429 *p++ = hexdigits[(ch >> 12) & 0xf];
3430 *p++ = hexdigits[(ch >> 8) & 0xf];
3431 *p++ = hexdigits[(ch >> 4) & 0xf];
3432 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003433 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003434 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003435#else
3436 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3437 if (ch >= 0xD800 && ch < 0xDC00) {
3438 Py_UNICODE ch2;
3439 Py_UCS4 ucs;
3440
3441 ch2 = *s++;
3442 size--;
3443 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3444 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3445 *p++ = '\\';
3446 *p++ = 'U';
3447 *p++ = hexdigits[(ucs >> 28) & 0xf];
3448 *p++ = hexdigits[(ucs >> 24) & 0xf];
3449 *p++ = hexdigits[(ucs >> 20) & 0xf];
3450 *p++ = hexdigits[(ucs >> 16) & 0xf];
3451 *p++ = hexdigits[(ucs >> 12) & 0xf];
3452 *p++ = hexdigits[(ucs >> 8) & 0xf];
3453 *p++ = hexdigits[(ucs >> 4) & 0xf];
3454 *p++ = hexdigits[ucs & 0xf];
3455 continue;
3456 }
3457 /* Fall through: isolated surrogates are copied as-is */
3458 s--;
3459 size++;
3460 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003461#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 /* Map 16-bit characters to '\uxxxx' */
3463 if (ch >= 256) {
3464 *p++ = '\\';
3465 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003466 *p++ = hexdigits[(ch >> 12) & 0xf];
3467 *p++ = hexdigits[(ch >> 8) & 0xf];
3468 *p++ = hexdigits[(ch >> 4) & 0xf];
3469 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 }
3471 /* Copy everything else as-is */
3472 else
3473 *p++ = (char) ch;
3474 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003475 size = p - q;
3476
3477 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003478 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003479 Py_DECREF(repr);
3480 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481}
3482
3483PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3484{
Walter Dörwald711005d2007-05-12 12:03:26 +00003485 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003487 PyErr_BadArgument();
3488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003490 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3491 PyUnicode_GET_SIZE(unicode));
3492
3493 if (!s)
3494 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003495 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003496 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003497 Py_DECREF(s);
3498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499}
3500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003501/* --- Unicode Internal Codec ------------------------------------------- */
3502
3503PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003504 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003505 const char *errors)
3506{
3507 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 Py_ssize_t startinpos;
3509 Py_ssize_t endinpos;
3510 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003511 PyUnicodeObject *v;
3512 Py_UNICODE *p;
3513 const char *end;
3514 const char *reason;
3515 PyObject *errorHandler = NULL;
3516 PyObject *exc = NULL;
3517
Neal Norwitzd43069c2006-01-08 01:12:10 +00003518#ifdef Py_UNICODE_WIDE
3519 Py_UNICODE unimax = PyUnicode_GetMax();
3520#endif
3521
Thomas Wouters89f507f2006-12-13 04:49:30 +00003522 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003523 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3524 if (v == NULL)
3525 goto onError;
3526 if (PyUnicode_GetSize((PyObject *)v) == 0)
3527 return (PyObject *)v;
3528 p = PyUnicode_AS_UNICODE(v);
3529 end = s + size;
3530
3531 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003532 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003533 /* We have to sanity check the raw data, otherwise doom looms for
3534 some malformed UCS-4 data. */
3535 if (
3536 #ifdef Py_UNICODE_WIDE
3537 *p > unimax || *p < 0 ||
3538 #endif
3539 end-s < Py_UNICODE_SIZE
3540 )
3541 {
3542 startinpos = s - starts;
3543 if (end-s < Py_UNICODE_SIZE) {
3544 endinpos = end-starts;
3545 reason = "truncated input";
3546 }
3547 else {
3548 endinpos = s - starts + Py_UNICODE_SIZE;
3549 reason = "illegal code point (> 0x10FFFF)";
3550 }
3551 outpos = p - PyUnicode_AS_UNICODE(v);
3552 if (unicode_decode_call_errorhandler(
3553 errors, &errorHandler,
3554 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003555 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003556 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003557 goto onError;
3558 }
3559 }
3560 else {
3561 p++;
3562 s += Py_UNICODE_SIZE;
3563 }
3564 }
3565
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003566 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003567 goto onError;
3568 Py_XDECREF(errorHandler);
3569 Py_XDECREF(exc);
3570 return (PyObject *)v;
3571
3572 onError:
3573 Py_XDECREF(v);
3574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
3576 return NULL;
3577}
3578
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579/* --- Latin-1 Codec ------------------------------------------------------ */
3580
3581PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 const char *errors)
3584{
3585 PyUnicodeObject *v;
3586 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003587
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003589 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003590 Py_UNICODE r = *(unsigned char*)s;
3591 return PyUnicode_FromUnicode(&r, 1);
3592 }
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 v = _PyUnicode_New(size);
3595 if (v == NULL)
3596 goto onError;
3597 if (size == 0)
3598 return (PyObject *)v;
3599 p = PyUnicode_AS_UNICODE(v);
3600 while (size-- > 0)
3601 *p++ = (unsigned char)*s++;
3602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 onError:
3605 Py_XDECREF(v);
3606 return NULL;
3607}
3608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609/* create or adjust a UnicodeEncodeError */
3610static void make_encode_exception(PyObject **exceptionObject,
3611 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003612 const Py_UNICODE *unicode, Py_ssize_t size,
3613 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 if (*exceptionObject == NULL) {
3617 *exceptionObject = PyUnicodeEncodeError_Create(
3618 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 }
3620 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3622 goto onError;
3623 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3624 goto onError;
3625 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3626 goto onError;
3627 return;
3628 onError:
3629 Py_DECREF(*exceptionObject);
3630 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 }
3632}
3633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634/* raises a UnicodeEncodeError */
3635static void raise_encode_exception(PyObject **exceptionObject,
3636 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003637 const Py_UNICODE *unicode, Py_ssize_t size,
3638 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 const char *reason)
3640{
3641 make_encode_exception(exceptionObject,
3642 encoding, unicode, size, startpos, endpos, reason);
3643 if (*exceptionObject != NULL)
3644 PyCodec_StrictErrors(*exceptionObject);
3645}
3646
3647/* error handling callback helper:
3648 build arguments, call the callback and check the arguments,
3649 put the result into newpos and return the replacement string, which
3650 has to be freed by the caller */
3651static PyObject *unicode_encode_call_errorhandler(const char *errors,
3652 PyObject **errorHandler,
3653 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003654 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3655 Py_ssize_t startpos, Py_ssize_t endpos,
3656 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003658 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659
3660 PyObject *restuple;
3661 PyObject *resunicode;
3662
3663 if (*errorHandler == NULL) {
3664 *errorHandler = PyCodec_LookupError(errors);
3665 if (*errorHandler == NULL)
3666 return NULL;
3667 }
3668
3669 make_encode_exception(exceptionObject,
3670 encoding, unicode, size, startpos, endpos, reason);
3671 if (*exceptionObject == NULL)
3672 return NULL;
3673
3674 restuple = PyObject_CallFunctionObjArgs(
3675 *errorHandler, *exceptionObject, NULL);
3676 if (restuple == NULL)
3677 return NULL;
3678 if (!PyTuple_Check(restuple)) {
3679 PyErr_Format(PyExc_TypeError, &argparse[4]);
3680 Py_DECREF(restuple);
3681 return NULL;
3682 }
3683 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3684 &resunicode, newpos)) {
3685 Py_DECREF(restuple);
3686 return NULL;
3687 }
3688 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003689 *newpos = size+*newpos;
3690 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003691 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003692 Py_DECREF(restuple);
3693 return NULL;
3694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 Py_INCREF(resunicode);
3696 Py_DECREF(restuple);
3697 return resunicode;
3698}
3699
3700static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003701 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 const char *errors,
3703 int limit)
3704{
3705 /* output object */
3706 PyObject *res;
3707 /* pointers to the beginning and end+1 of input */
3708 const Py_UNICODE *startp = p;
3709 const Py_UNICODE *endp = p + size;
3710 /* pointer to the beginning of the unencodable characters */
3711 /* const Py_UNICODE *badp = NULL; */
3712 /* pointer into the output */
3713 char *str;
3714 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003715 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003716 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3717 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 PyObject *errorHandler = NULL;
3719 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003720 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 /* the following variable is used for caching string comparisons
3722 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3723 int known_errorHandler = -1;
3724
3725 /* allocate enough for a simple encoding without
3726 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003727 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003728 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003729 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003731 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003732 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 ressize = size;
3734
3735 while (p<endp) {
3736 Py_UNICODE c = *p;
3737
3738 /* can we encode this? */
3739 if (c<limit) {
3740 /* no overflow check, because we know that the space is enough */
3741 *str++ = (char)c;
3742 ++p;
3743 }
3744 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t unicodepos = p-startp;
3746 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003748 Py_ssize_t repsize;
3749 Py_ssize_t newpos;
3750 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 Py_UNICODE *uni2;
3752 /* startpos for collecting unencodable chars */
3753 const Py_UNICODE *collstart = p;
3754 const Py_UNICODE *collend = p;
3755 /* find all unecodable characters */
3756 while ((collend < endp) && ((*collend)>=limit))
3757 ++collend;
3758 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3759 if (known_errorHandler==-1) {
3760 if ((errors==NULL) || (!strcmp(errors, "strict")))
3761 known_errorHandler = 1;
3762 else if (!strcmp(errors, "replace"))
3763 known_errorHandler = 2;
3764 else if (!strcmp(errors, "ignore"))
3765 known_errorHandler = 3;
3766 else if (!strcmp(errors, "xmlcharrefreplace"))
3767 known_errorHandler = 4;
3768 else
3769 known_errorHandler = 0;
3770 }
3771 switch (known_errorHandler) {
3772 case 1: /* strict */
3773 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3774 goto onError;
3775 case 2: /* replace */
3776 while (collstart++<collend)
3777 *str++ = '?'; /* fall through */
3778 case 3: /* ignore */
3779 p = collend;
3780 break;
3781 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003782 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 /* determine replacement size (temporarily (mis)uses p) */
3784 for (p = collstart, repsize = 0; p < collend; ++p) {
3785 if (*p<10)
3786 repsize += 2+1+1;
3787 else if (*p<100)
3788 repsize += 2+2+1;
3789 else if (*p<1000)
3790 repsize += 2+3+1;
3791 else if (*p<10000)
3792 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003793#ifndef Py_UNICODE_WIDE
3794 else
3795 repsize += 2+5+1;
3796#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 else if (*p<100000)
3798 repsize += 2+5+1;
3799 else if (*p<1000000)
3800 repsize += 2+6+1;
3801 else
3802 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003803#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 }
3805 requiredsize = respos+repsize+(endp-collend);
3806 if (requiredsize > ressize) {
3807 if (requiredsize<2*ressize)
3808 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003809 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003811 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 ressize = requiredsize;
3813 }
3814 /* generate replacement (temporarily (mis)uses p) */
3815 for (p = collstart; p < collend; ++p) {
3816 str += sprintf(str, "&#%d;", (int)*p);
3817 }
3818 p = collend;
3819 break;
3820 default:
3821 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3822 encoding, reason, startp, size, &exc,
3823 collstart-startp, collend-startp, &newpos);
3824 if (repunicode == NULL)
3825 goto onError;
3826 /* need more space? (at least enough for what we
3827 have+the replacement+the rest of the string, so
3828 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003829 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 repsize = PyUnicode_GET_SIZE(repunicode);
3831 requiredsize = respos+repsize+(endp-collend);
3832 if (requiredsize > ressize) {
3833 if (requiredsize<2*ressize)
3834 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003835 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 Py_DECREF(repunicode);
3837 goto onError;
3838 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003839 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 ressize = requiredsize;
3841 }
3842 /* check if there is anything unencodable in the replacement
3843 and copy it to the output */
3844 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3845 c = *uni2;
3846 if (c >= limit) {
3847 raise_encode_exception(&exc, encoding, startp, size,
3848 unicodepos, unicodepos+1, reason);
3849 Py_DECREF(repunicode);
3850 goto onError;
3851 }
3852 *str = (char)c;
3853 }
3854 p = startp + newpos;
3855 Py_DECREF(repunicode);
3856 }
3857 }
3858 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003859 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003860 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003861 onError:
3862 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 Py_XDECREF(errorHandler);
3864 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003865 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866}
3867
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003869 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 const char *errors)
3871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873}
3874
3875PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3876{
3877 if (!PyUnicode_Check(unicode)) {
3878 PyErr_BadArgument();
3879 return NULL;
3880 }
3881 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3882 PyUnicode_GET_SIZE(unicode),
3883 NULL);
3884}
3885
3886/* --- 7-bit ASCII Codec -------------------------------------------------- */
3887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 const char *errors)
3891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 PyUnicodeObject *v;
3894 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 Py_ssize_t startinpos;
3896 Py_ssize_t endinpos;
3897 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 const char *e;
3899 PyObject *errorHandler = NULL;
3900 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003903 if (size == 1 && *(unsigned char*)s < 128) {
3904 Py_UNICODE r = *(unsigned char*)s;
3905 return PyUnicode_FromUnicode(&r, 1);
3906 }
Tim Petersced69f82003-09-16 20:30:58 +00003907
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 v = _PyUnicode_New(size);
3909 if (v == NULL)
3910 goto onError;
3911 if (size == 0)
3912 return (PyObject *)v;
3913 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 e = s + size;
3915 while (s < e) {
3916 register unsigned char c = (unsigned char)*s;
3917 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 ++s;
3920 }
3921 else {
3922 startinpos = s-starts;
3923 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003924 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 if (unicode_decode_call_errorhandler(
3926 errors, &errorHandler,
3927 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003928 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003929 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003933 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003934 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003935 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 Py_XDECREF(errorHandler);
3937 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003939
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 onError:
3941 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 Py_XDECREF(errorHandler);
3943 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 return NULL;
3945}
3946
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003948 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 const char *errors)
3950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952}
3953
3954PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3955{
3956 if (!PyUnicode_Check(unicode)) {
3957 PyErr_BadArgument();
3958 return NULL;
3959 }
3960 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3961 PyUnicode_GET_SIZE(unicode),
3962 NULL);
3963}
3964
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003965#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003966
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003967/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003968
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003969#if SIZEOF_INT < SIZEOF_SSIZE_T
3970#define NEED_RETRY
3971#endif
3972
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte that is not
   itself the second half of the preceding character, 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    /* It looks like a lead byte; inspect the character before it to
       decide whether it really starts a character. */
    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3988
3989/*
3990 * Decode MBCS string into unicode object. If 'final' is set, converts
3991 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3992 */
3993static int decode_mbcs(PyUnicodeObject **v,
3994 const char *s, /* MBCS string */
3995 int size, /* sizeof MBCS string */
3996 int final)
3997{
3998 Py_UNICODE *p;
3999 Py_ssize_t n = 0;
4000 int usize = 0;
4001
4002 assert(size >= 0);
4003
4004 /* Skip trailing lead-byte unless 'final' is set */
4005 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4006 --size;
4007
4008 /* First get the size of the result */
4009 if (size > 0) {
4010 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4011 if (usize == 0) {
4012 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4013 return -1;
4014 }
4015 }
4016
4017 if (*v == NULL) {
4018 /* Create unicode object */
4019 *v = _PyUnicode_New(usize);
4020 if (*v == NULL)
4021 return -1;
4022 }
4023 else {
4024 /* Extend unicode object */
4025 n = PyUnicode_GET_SIZE(*v);
4026 if (_PyUnicode_Resize(v, n + usize) < 0)
4027 return -1;
4028 }
4029
4030 /* Do the conversion */
4031 if (size > 0) {
4032 p = PyUnicode_AS_UNICODE(*v) + n;
4033 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4034 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4035 return -1;
4036 }
4037 }
4038
4039 return size;
4040}
4041
4042PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4043 Py_ssize_t size,
4044 const char *errors,
4045 Py_ssize_t *consumed)
4046{
4047 PyUnicodeObject *v = NULL;
4048 int done;
4049
4050 if (consumed)
4051 *consumed = 0;
4052
4053#ifdef NEED_RETRY
4054 retry:
4055 if (size > INT_MAX)
4056 done = decode_mbcs(&v, s, INT_MAX, 0);
4057 else
4058#endif
4059 done = decode_mbcs(&v, s, (int)size, !consumed);
4060
4061 if (done < 0) {
4062 Py_XDECREF(v);
4063 return NULL;
4064 }
4065
4066 if (consumed)
4067 *consumed += done;
4068
4069#ifdef NEED_RETRY
4070 if (size > INT_MAX) {
4071 s += done;
4072 size -= done;
4073 goto retry;
4074 }
4075#endif
4076
4077 return (PyObject *)v;
4078}
4079
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004081 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004082 const char *errors)
4083{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004084 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4085}
4086
4087/*
4088 * Convert unicode into string object (MBCS).
4089 * Returns 0 if succeed, -1 otherwise.
4090 */
4091static int encode_mbcs(PyObject **repr,
4092 const Py_UNICODE *p, /* unicode */
4093 int size) /* size of unicode */
4094{
4095 int mbcssize = 0;
4096 Py_ssize_t n = 0;
4097
4098 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004099
4100 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004101 if (size > 0) {
4102 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4103 if (mbcssize == 0) {
4104 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4105 return -1;
4106 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004107 }
4108
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004109 if (*repr == NULL) {
4110 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004111 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004112 if (*repr == NULL)
4113 return -1;
4114 }
4115 else {
4116 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004117 n = PyBytes_Size(*repr);
4118 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004119 return -1;
4120 }
4121
4122 /* Do the conversion */
4123 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004124 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004125 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4126 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4127 return -1;
4128 }
4129 }
4130
4131 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004132}
4133
4134PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004135 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004136 const char *errors)
4137{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004138 PyObject *repr = NULL;
4139 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004140
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004141#ifdef NEED_RETRY
4142 retry:
4143 if (size > INT_MAX)
4144 ret = encode_mbcs(&repr, p, INT_MAX);
4145 else
4146#endif
4147 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004149 if (ret < 0) {
4150 Py_XDECREF(repr);
4151 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004152 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004153
4154#ifdef NEED_RETRY
4155 if (size > INT_MAX) {
4156 p += INT_MAX;
4157 size -= INT_MAX;
4158 goto retry;
4159 }
4160#endif
4161
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004162 return repr;
4163}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004164
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004165PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4166{
4167 if (!PyUnicode_Check(unicode)) {
4168 PyErr_BadArgument();
4169 return NULL;
4170 }
4171 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4172 PyUnicode_GET_SIZE(unicode),
4173 NULL);
4174}
4175
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004176#undef NEED_RETRY
4177
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004178#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180/* --- Character Mapping Codec -------------------------------------------- */
4181
/* Decode `size` bytes at `s` through a character map.

   `mapping` may be:
     - NULL: the input is decoded as Latin-1;
     - an exact unicode object: byte b maps to mapping[b]; an index past the
       end of the string or the value 0xfffe means "undefined";
     - any other object supporting item lookup by integer: the result must
       be an integer in range(65536), a unicode string (of any length,
       including empty) or None; None and a LookupError both mean
       "undefined".

   Undefined bytes are reported to the error handler named by `errors`.
   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* output slots added by resizing for 1-n mappings that have not been
       used up yet */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the map is a plain unicode string indexed by byte */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* the handler is given the addresses of s, p and v and
                   may update them, so s is not advanced here */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* general path: look every byte up in an arbitrary mapping */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is left to the handler; skip the ++s below */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* remember the offset: the buffer may move */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* the shortfall plus 4*targetsize of headroom */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result if fewer characters were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4342
/* Charmap encoding: the lookup table */

/* Three-level trie mapping a 16-bit character to a byte, built by
   PyUnicode_BuildEncodingMap() and queried by encoding_map_lookup().
   The object is allocated with extra room after the struct, so level23
   really holds 16*count2 + 128*count3 bytes. */
struct encoding_map{
    PyObject_HEAD
    /* indexed by c>>11: number of the level-2 block, or 0xFF if unused */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and of 128-byte level-3 blocks */
    int count2, count3;
    /* the level-2 blocks (entries are level-3 block numbers, 0xFF if
       unused) followed by the level-3 blocks (entries are the encoded
       byte, 0 meaning "no mapping") */
    unsigned char level23[1];
};
4351
4352static PyObject*
4353encoding_map_size(PyObject *obj, PyObject* args)
4354{
4355 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004356 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004357 128*map->count3);
4358}
4359
/* Method table of the EncodingMap type: a single argument-less method,
   "size", followed by the zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4365
/* tp_dealloc of the EncodingMap type: the object is one flat allocation
   (see PyObject_MALLOC in PyUnicode_BuildEncodingMap), so releasing that
   block is all there is to do. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4371
/* Type object for the trie-based encoding map.  Only the name, basic size,
   deallocator, default flags and the method table are filled in; every
   other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4415
4416PyObject*
4417PyUnicode_BuildEncodingMap(PyObject* string)
4418{
4419 Py_UNICODE *decode;
4420 PyObject *result;
4421 struct encoding_map *mresult;
4422 int i;
4423 int need_dict = 0;
4424 unsigned char level1[32];
4425 unsigned char level2[512];
4426 unsigned char *mlevel1, *mlevel2, *mlevel3;
4427 int count2 = 0, count3 = 0;
4428
4429 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4430 PyErr_BadArgument();
4431 return NULL;
4432 }
4433 decode = PyUnicode_AS_UNICODE(string);
4434 memset(level1, 0xFF, sizeof level1);
4435 memset(level2, 0xFF, sizeof level2);
4436
4437 /* If there isn't a one-to-one mapping of NULL to \0,
4438 or if there are non-BMP characters, we need to use
4439 a mapping dictionary. */
4440 if (decode[0] != 0)
4441 need_dict = 1;
4442 for (i = 1; i < 256; i++) {
4443 int l1, l2;
4444 if (decode[i] == 0
4445 #ifdef Py_UNICODE_WIDE
4446 || decode[i] > 0xFFFF
4447 #endif
4448 ) {
4449 need_dict = 1;
4450 break;
4451 }
4452 if (decode[i] == 0xFFFE)
4453 /* unmapped character */
4454 continue;
4455 l1 = decode[i] >> 11;
4456 l2 = decode[i] >> 7;
4457 if (level1[l1] == 0xFF)
4458 level1[l1] = count2++;
4459 if (level2[l2] == 0xFF)
4460 level2[l2] = count3++;
4461 }
4462
4463 if (count2 >= 0xFF || count3 >= 0xFF)
4464 need_dict = 1;
4465
4466 if (need_dict) {
4467 PyObject *result = PyDict_New();
4468 PyObject *key, *value;
4469 if (!result)
4470 return NULL;
4471 for (i = 0; i < 256; i++) {
4472 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004473 key = PyLong_FromLong(decode[i]);
4474 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004475 if (!key || !value)
4476 goto failed1;
4477 if (PyDict_SetItem(result, key, value) == -1)
4478 goto failed1;
4479 Py_DECREF(key);
4480 Py_DECREF(value);
4481 }
4482 return result;
4483 failed1:
4484 Py_XDECREF(key);
4485 Py_XDECREF(value);
4486 Py_DECREF(result);
4487 return NULL;
4488 }
4489
4490 /* Create a three-level trie */
4491 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4492 16*count2 + 128*count3 - 1);
4493 if (!result)
4494 return PyErr_NoMemory();
4495 PyObject_Init(result, &EncodingMapType);
4496 mresult = (struct encoding_map*)result;
4497 mresult->count2 = count2;
4498 mresult->count3 = count3;
4499 mlevel1 = mresult->level1;
4500 mlevel2 = mresult->level23;
4501 mlevel3 = mresult->level23 + 16*count2;
4502 memcpy(mlevel1, level1, 32);
4503 memset(mlevel2, 0xFF, 16*count2);
4504 memset(mlevel3, 0, 128*count3);
4505 count3 = 0;
4506 for (i = 1; i < 256; i++) {
4507 int o1, o2, o3, i2, i3;
4508 if (decode[i] == 0xFFFE)
4509 /* unmapped character */
4510 continue;
4511 o1 = decode[i]>>11;
4512 o2 = (decode[i]>>7) & 0xF;
4513 i2 = 16*mlevel1[o1] + o2;
4514 if (mlevel2[i2] == 0xFF)
4515 mlevel2[i2] = count3++;
4516 o3 = decode[i] & 0x7F;
4517 i3 = 128*mlevel2[i2] + o3;
4518 mlevel3[i3] = i;
4519 }
4520 return result;
4521}
4522
4523static int
4524encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4525{
4526 struct encoding_map *map = (struct encoding_map*)mapping;
4527 int l1 = c>>11;
4528 int l2 = (c>>7) & 0xF;
4529 int l3 = c & 0x7F;
4530 int i;
4531
4532#ifdef Py_UNICODE_WIDE
4533 if (c > 0xFFFF) {
4534 return -1;
4535 }
4536#endif
4537 if (c == 0)
4538 return 0;
4539 /* level 1*/
4540 i = map->level1[l1];
4541 if (i == 0xFF) {
4542 return -1;
4543 }
4544 /* level 2*/
4545 i = map->level23[16*i+l2];
4546 if (i == 0xFF) {
4547 return -1;
4548 }
4549 /* level 3 */
4550 i = map->level23[16*map->count2 + 128*i + l3];
4551 if (i == 0) {
4552 return -1;
4553 }
4554 return i;
4555}
4556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557/* Lookup the character ch in the mapping. If the character
4558 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004559 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561{
Christian Heimes217cfd12007-12-02 14:31:20 +00004562 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 PyObject *x;
4564
4565 if (w == NULL)
4566 return NULL;
4567 x = PyObject_GetItem(mapping, w);
4568 Py_DECREF(w);
4569 if (x == NULL) {
4570 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4571 /* No mapping found means: mapping is undefined. */
4572 PyErr_Clear();
4573 x = Py_None;
4574 Py_INCREF(x);
4575 return x;
4576 } else
4577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004579 else if (x == Py_None)
4580 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004581 else if (PyLong_Check(x)) {
4582 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (value < 0 || value > 255) {
4584 PyErr_SetString(PyExc_TypeError,
4585 "character mapping must be in range(256)");
4586 Py_DECREF(x);
4587 return NULL;
4588 }
4589 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004591 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004595 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004596 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004597 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 Py_DECREF(x);
4599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 }
4601}
4602
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004603static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004604charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004605{
Christian Heimes72b710a2008-05-26 13:28:38 +00004606 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004607 /* exponentially overallocate to minimize reallocations */
4608 if (requiredsize < 2*outsize)
4609 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004610 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004611 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004612 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004613}
4614
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004619 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 space is available. Return a new reference to the object that
4621 was put in the output buffer, or Py_None, if the mapping was undefined
4622 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004623 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004625charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004626 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004628 PyObject *rep;
4629 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004630 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631
Christian Heimes90aa7642007-12-19 02:45:37 +00004632 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004633 int res = encoding_map_lookup(c, mapping);
4634 Py_ssize_t requiredsize = *outpos+1;
4635 if (res == -1)
4636 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004637 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004638 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004639 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004640 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004641 outstart[(*outpos)++] = (char)res;
4642 return enc_SUCCESS;
4643 }
4644
4645 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004647 return enc_EXCEPTION;
4648 else if (rep==Py_None) {
4649 Py_DECREF(rep);
4650 return enc_FAILED;
4651 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004652 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004654 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004655 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004657 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004659 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004660 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 }
4662 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004663 const char *repchars = PyBytes_AS_STRING(rep);
4664 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004665 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004666 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004667 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004669 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004671 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 memcpy(outstart + *outpos, repchars, repsize);
4673 *outpos += repsize;
4674 }
4675 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004676 Py_DECREF(rep);
4677 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678}
4679
4680/* handle an error in PyUnicode_EncodeCharmap
4681 Return 0 on success, -1 on error */
4682static
4683int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004684 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004686 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004687 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688{
4689 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 Py_ssize_t repsize;
4691 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 Py_UNICODE *uni2;
4693 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004694 Py_ssize_t collstartpos = *inpos;
4695 Py_ssize_t collendpos = *inpos+1;
4696 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 char *encoding = "charmap";
4698 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004699 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 /* find all unencodable characters */
4702 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004703 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004704 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004705 int res = encoding_map_lookup(p[collendpos], mapping);
4706 if (res != -1)
4707 break;
4708 ++collendpos;
4709 continue;
4710 }
4711
4712 rep = charmapencode_lookup(p[collendpos], mapping);
4713 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004715 else if (rep!=Py_None) {
4716 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 break;
4718 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004719 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 ++collendpos;
4721 }
4722 /* cache callback name lookup
4723 * (if not done yet, i.e. it's the first error) */
4724 if (*known_errorHandler==-1) {
4725 if ((errors==NULL) || (!strcmp(errors, "strict")))
4726 *known_errorHandler = 1;
4727 else if (!strcmp(errors, "replace"))
4728 *known_errorHandler = 2;
4729 else if (!strcmp(errors, "ignore"))
4730 *known_errorHandler = 3;
4731 else if (!strcmp(errors, "xmlcharrefreplace"))
4732 *known_errorHandler = 4;
4733 else
4734 *known_errorHandler = 0;
4735 }
4736 switch (*known_errorHandler) {
4737 case 1: /* strict */
4738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739 return -1;
4740 case 2: /* replace */
4741 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4742 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004743 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 return -1;
4745 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004746 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4748 return -1;
4749 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 }
4751 /* fall through */
4752 case 3: /* ignore */
4753 *inpos = collendpos;
4754 break;
4755 case 4: /* xmlcharrefreplace */
4756 /* generate replacement (temporarily (mis)uses p) */
4757 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4758 char buffer[2+29+1+1];
4759 char *cp;
4760 sprintf(buffer, "&#%d;", (int)p[collpos]);
4761 for (cp = buffer; *cp; ++cp) {
4762 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004763 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004765 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4767 return -1;
4768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 }
4770 }
4771 *inpos = collendpos;
4772 break;
4773 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004774 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 encoding, reason, p, size, exceptionObject,
4776 collstartpos, collendpos, &newpos);
4777 if (repunicode == NULL)
4778 return -1;
4779 /* generate replacement */
4780 repsize = PyUnicode_GET_SIZE(repunicode);
4781 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4782 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004783 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 return -1;
4785 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004786 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4789 return -1;
4790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 }
4792 *inpos = newpos;
4793 Py_DECREF(repunicode);
4794 }
4795 return 0;
4796}
4797
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 PyObject *mapping,
4801 const char *errors)
4802{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 /* output object */
4804 PyObject *res = NULL;
4805 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 PyObject *errorHandler = NULL;
4810 PyObject *exc = NULL;
4811 /* the following variable is used for caching string comparisons
4812 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4813 * 3=ignore, 4=xmlcharrefreplace */
4814 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815
4816 /* Default to Latin-1 */
4817 if (mapping == NULL)
4818 return PyUnicode_EncodeLatin1(p, size, errors);
4819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 /* allocate enough for a simple encoding without
4821 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004822 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 if (res == NULL)
4824 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004825 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 while (inpos<size) {
4829 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004830 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004831 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004833 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 if (charmap_encoding_error(p, size, &inpos, mapping,
4835 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004836 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004837 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004838 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 else
4842 /* done with this character => adjust input position */
4843 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004847 if (respos<PyBytes_GET_SIZE(res))
4848 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_XDECREF(exc);
4851 Py_XDECREF(errorHandler);
4852 return res;
4853
4854 onError:
4855 Py_XDECREF(res);
4856 Py_XDECREF(exc);
4857 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 return NULL;
4859}
4860
4861PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4862 PyObject *mapping)
4863{
4864 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4865 PyErr_BadArgument();
4866 return NULL;
4867 }
4868 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4869 PyUnicode_GET_SIZE(unicode),
4870 mapping,
4871 NULL);
4872}
4873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874/* create or adjust a UnicodeTranslateError */
4875static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004876 const Py_UNICODE *unicode, Py_ssize_t size,
4877 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 if (*exceptionObject == NULL) {
4881 *exceptionObject = PyUnicodeTranslateError_Create(
4882 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 }
4884 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4886 goto onError;
4887 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4888 goto onError;
4889 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4890 goto onError;
4891 return;
4892 onError:
4893 Py_DECREF(*exceptionObject);
4894 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 }
4896}
4897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898/* raises a UnicodeTranslateError */
4899static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 const Py_UNICODE *unicode, Py_ssize_t size,
4901 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 const char *reason)
4903{
4904 make_translate_exception(exceptionObject,
4905 unicode, size, startpos, endpos, reason);
4906 if (*exceptionObject != NULL)
4907 PyCodec_StrictErrors(*exceptionObject);
4908}
4909
4910/* error handling callback helper:
4911 build arguments, call the callback and check the arguments,
4912 put the result into newpos and return the replacement string, which
4913 has to be freed by the caller */
4914static PyObject *unicode_translate_call_errorhandler(const char *errors,
4915 PyObject **errorHandler,
4916 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004917 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4918 Py_ssize_t startpos, Py_ssize_t endpos,
4919 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004921 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004923 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 PyObject *restuple;
4925 PyObject *resunicode;
4926
4927 if (*errorHandler == NULL) {
4928 *errorHandler = PyCodec_LookupError(errors);
4929 if (*errorHandler == NULL)
4930 return NULL;
4931 }
4932
4933 make_translate_exception(exceptionObject,
4934 unicode, size, startpos, endpos, reason);
4935 if (*exceptionObject == NULL)
4936 return NULL;
4937
4938 restuple = PyObject_CallFunctionObjArgs(
4939 *errorHandler, *exceptionObject, NULL);
4940 if (restuple == NULL)
4941 return NULL;
4942 if (!PyTuple_Check(restuple)) {
4943 PyErr_Format(PyExc_TypeError, &argparse[4]);
4944 Py_DECREF(restuple);
4945 return NULL;
4946 }
4947 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004948 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 Py_DECREF(restuple);
4950 return NULL;
4951 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004952 if (i_newpos<0)
4953 *newpos = size+i_newpos;
4954 else
4955 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004956 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004957 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004958 Py_DECREF(restuple);
4959 return NULL;
4960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 Py_INCREF(resunicode);
4962 Py_DECREF(restuple);
4963 return resunicode;
4964}
4965
4966/* Lookup the character ch in the mapping and put the result in result,
4967 which must be decrefed by the caller.
4968 Return 0 on success, -1 on error */
4969static
4970int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4971{
Christian Heimes217cfd12007-12-02 14:31:20 +00004972 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 PyObject *x;
4974
4975 if (w == NULL)
4976 return -1;
4977 x = PyObject_GetItem(mapping, w);
4978 Py_DECREF(w);
4979 if (x == NULL) {
4980 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4981 /* No mapping found means: use 1:1 mapping. */
4982 PyErr_Clear();
4983 *result = NULL;
4984 return 0;
4985 } else
4986 return -1;
4987 }
4988 else if (x == Py_None) {
4989 *result = x;
4990 return 0;
4991 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004992 else if (PyLong_Check(x)) {
4993 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 long max = PyUnicode_GetMax();
4995 if (value < 0 || value > max) {
4996 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004997 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 Py_DECREF(x);
4999 return -1;
5000 }
5001 *result = x;
5002 return 0;
5003 }
5004 else if (PyUnicode_Check(x)) {
5005 *result = x;
5006 return 0;
5007 }
5008 else {
5009 /* wrong return value */
5010 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005011 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005012 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 return -1;
5014 }
5015}
5016/* ensure that *outobj is at least requiredsize characters long,
5017if not reallocate and adjust various state variables.
5018Return 0 on success, -1 on error */
5019static
Walter Dörwald4894c302003-10-24 14:25:28 +00005020int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005021 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005023 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005024 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005026 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005028 if (requiredsize < 2 * oldsize)
5029 requiredsize = 2 * oldsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005030 if (PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 return -1;
5032 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 }
5034 return 0;
5035}
5036/* lookup the character, put the result in the output string and adjust
5037 various state variables. Return a new reference to the object that
5038 was put in the output buffer in *result, or Py_None, if the mapping was
5039 undefined (in which case no character was written).
5040 The called must decref result.
5041 Return 0 on success, -1 on error. */
5042static
Walter Dörwald4894c302003-10-24 14:25:28 +00005043int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005045 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046{
Walter Dörwald4894c302003-10-24 14:25:28 +00005047 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 return -1;
5049 if (*res==NULL) {
5050 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005051 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 }
5053 else if (*res==Py_None)
5054 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005055 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005057 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 }
5059 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005060 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 if (repsize==1) {
5062 /* no overflow check, because we know that the space is enough */
5063 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5064 }
5065 else if (repsize!=0) {
5066 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005067 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005068 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005069 repsize - 1;
5070 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 return -1;
5072 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5073 *outp += repsize;
5074 }
5075 }
5076 else
5077 return -1;
5078 return 0;
5079}
5080
5081PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005082 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 PyObject *mapping,
5084 const char *errors)
5085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 /* output object */
5087 PyObject *res = NULL;
5088 /* pointers to the beginning and end+1 of input */
5089 const Py_UNICODE *startp = p;
5090 const Py_UNICODE *endp = p + size;
5091 /* pointer into the output */
5092 Py_UNICODE *str;
5093 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095 char *reason = "character maps to <undefined>";
5096 PyObject *errorHandler = NULL;
5097 PyObject *exc = NULL;
5098 /* the following variable is used for caching string comparisons
5099 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5100 * 3=ignore, 4=xmlcharrefreplace */
5101 int known_errorHandler = -1;
5102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 if (mapping == NULL) {
5104 PyErr_BadArgument();
5105 return NULL;
5106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107
5108 /* allocate enough for a simple 1:1 translation without
5109 replacements, if we need more, we'll resize */
5110 res = PyUnicode_FromUnicode(NULL, size);
5111 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005112 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 return res;
5115 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 while (p<endp) {
5118 /* try to encode it */
5119 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005120 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005121 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 goto onError;
5123 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005124 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 if (x!=Py_None) /* it worked => adjust input pointer */
5126 ++p;
5127 else { /* untranslatable character */
5128 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005129 Py_ssize_t repsize;
5130 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 Py_UNICODE *uni2;
5132 /* startpos for collecting untranslatable chars */
5133 const Py_UNICODE *collstart = p;
5134 const Py_UNICODE *collend = p+1;
5135 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 /* find all untranslatable characters */
5138 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005139 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 goto onError;
5141 Py_XDECREF(x);
5142 if (x!=Py_None)
5143 break;
5144 ++collend;
5145 }
5146 /* cache callback name lookup
5147 * (if not done yet, i.e. it's the first error) */
5148 if (known_errorHandler==-1) {
5149 if ((errors==NULL) || (!strcmp(errors, "strict")))
5150 known_errorHandler = 1;
5151 else if (!strcmp(errors, "replace"))
5152 known_errorHandler = 2;
5153 else if (!strcmp(errors, "ignore"))
5154 known_errorHandler = 3;
5155 else if (!strcmp(errors, "xmlcharrefreplace"))
5156 known_errorHandler = 4;
5157 else
5158 known_errorHandler = 0;
5159 }
5160 switch (known_errorHandler) {
5161 case 1: /* strict */
5162 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5163 goto onError;
5164 case 2: /* replace */
5165 /* No need to check for space, this is a 1:1 replacement */
5166 for (coll = collstart; coll<collend; ++coll)
5167 *str++ = '?';
5168 /* fall through */
5169 case 3: /* ignore */
5170 p = collend;
5171 break;
5172 case 4: /* xmlcharrefreplace */
5173 /* generate replacement (temporarily (mis)uses p) */
5174 for (p = collstart; p < collend; ++p) {
5175 char buffer[2+29+1+1];
5176 char *cp;
5177 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005178 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5180 goto onError;
5181 for (cp = buffer; *cp; ++cp)
5182 *str++ = *cp;
5183 }
5184 p = collend;
5185 break;
5186 default:
5187 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5188 reason, startp, size, &exc,
5189 collstart-startp, collend-startp, &newpos);
5190 if (repunicode == NULL)
5191 goto onError;
5192 /* generate replacement */
5193 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005194 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5196 Py_DECREF(repunicode);
5197 goto onError;
5198 }
5199 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5200 *str++ = *uni2;
5201 p = startp + newpos;
5202 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 }
5204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 /* Resize if we allocated to much */
5207 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005208 if (respos<PyUnicode_GET_SIZE(res)) {
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005209 if (PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005210 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 }
5212 Py_XDECREF(exc);
5213 Py_XDECREF(errorHandler);
5214 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 onError:
5217 Py_XDECREF(res);
5218 Py_XDECREF(exc);
5219 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 return NULL;
5221}
5222
5223PyObject *PyUnicode_Translate(PyObject *str,
5224 PyObject *mapping,
5225 const char *errors)
5226{
5227 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 str = PyUnicode_FromObject(str);
5230 if (str == NULL)
5231 goto onError;
5232 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5233 PyUnicode_GET_SIZE(str),
5234 mapping,
5235 errors);
5236 Py_DECREF(str);
5237 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 onError:
5240 Py_XDECREF(str);
5241 return NULL;
5242}
Tim Petersced69f82003-09-16 20:30:58 +00005243
Guido van Rossum9e896b32000-04-05 20:11:21 +00005244/* --- Decimal Encoder ---------------------------------------------------- */
5245
5246int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005248 char *output,
5249 const char *errors)
5250{
5251 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
5254 const char *encoding = "decimal";
5255 const char *reason = "invalid decimal Unicode string";
5256 /* the following variable is used for caching string comparisons
5257 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5258 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005259
5260 if (output == NULL) {
5261 PyErr_BadArgument();
5262 return -1;
5263 }
5264
5265 p = s;
5266 end = s + length;
5267 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005269 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271 Py_ssize_t repsize;
5272 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005273 Py_UNICODE *uni2;
5274 Py_UNICODE *collstart;
5275 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005276
Guido van Rossum9e896b32000-04-05 20:11:21 +00005277 if (Py_UNICODE_ISSPACE(ch)) {
5278 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005280 continue;
5281 }
5282 decimal = Py_UNICODE_TODECIMAL(ch);
5283 if (decimal >= 0) {
5284 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005286 continue;
5287 }
Guido van Rossumba477042000-04-06 18:18:10 +00005288 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005289 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005291 continue;
5292 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293 /* All other characters are considered unencodable */
5294 collstart = p;
5295 collend = p+1;
5296 while (collend < end) {
5297 if ((0 < *collend && *collend < 256) ||
5298 !Py_UNICODE_ISSPACE(*collend) ||
5299 Py_UNICODE_TODECIMAL(*collend))
5300 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005301 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 /* cache callback name lookup
5303 * (if not done yet, i.e. it's the first error) */
5304 if (known_errorHandler==-1) {
5305 if ((errors==NULL) || (!strcmp(errors, "strict")))
5306 known_errorHandler = 1;
5307 else if (!strcmp(errors, "replace"))
5308 known_errorHandler = 2;
5309 else if (!strcmp(errors, "ignore"))
5310 known_errorHandler = 3;
5311 else if (!strcmp(errors, "xmlcharrefreplace"))
5312 known_errorHandler = 4;
5313 else
5314 known_errorHandler = 0;
5315 }
5316 switch (known_errorHandler) {
5317 case 1: /* strict */
5318 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5319 goto onError;
5320 case 2: /* replace */
5321 for (p = collstart; p < collend; ++p)
5322 *output++ = '?';
5323 /* fall through */
5324 case 3: /* ignore */
5325 p = collend;
5326 break;
5327 case 4: /* xmlcharrefreplace */
5328 /* generate replacement (temporarily (mis)uses p) */
5329 for (p = collstart; p < collend; ++p)
5330 output += sprintf(output, "&#%d;", (int)*p);
5331 p = collend;
5332 break;
5333 default:
5334 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5335 encoding, reason, s, length, &exc,
5336 collstart-s, collend-s, &newpos);
5337 if (repunicode == NULL)
5338 goto onError;
5339 /* generate replacement */
5340 repsize = PyUnicode_GET_SIZE(repunicode);
5341 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5342 Py_UNICODE ch = *uni2;
5343 if (Py_UNICODE_ISSPACE(ch))
5344 *output++ = ' ';
5345 else {
5346 decimal = Py_UNICODE_TODECIMAL(ch);
5347 if (decimal >= 0)
5348 *output++ = '0' + decimal;
5349 else if (0 < ch && ch < 256)
5350 *output++ = (char)ch;
5351 else {
5352 Py_DECREF(repunicode);
5353 raise_encode_exception(&exc, encoding,
5354 s, length, collstart-s, collend-s, reason);
5355 goto onError;
5356 }
5357 }
5358 }
5359 p = s + newpos;
5360 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005361 }
5362 }
5363 /* 0-terminate the output string */
5364 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 Py_XDECREF(exc);
5366 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005367 return 0;
5368
5369 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 Py_XDECREF(exc);
5371 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005372 return -1;
5373}
5374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375/* --- Helpers ------------------------------------------------------------ */
5376
Eric Smith8c663262007-08-25 02:26:07 +00005377#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005378#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005379#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005380/* Include _ParseTupleFinds from find.h */
5381#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005382#include "stringlib/find.h"
5383#include "stringlib/partition.h"
5384
Eric Smith5807c412008-05-11 21:00:57 +00005385#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5386#include "stringlib/localeutil.h"
5387
/* Helper macro normalising the slice bounds `start` and `end`, which must
   be variables in the expanding scope, against (obj)->length:
     - a negative start is offset by the length and then floored at 0;
     - an end beyond the length is capped at the length;
     - a negative end is offset by the length and then floored at 0.
   Note that start is not capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5400
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005402 PyObject *substr,
5403 Py_ssize_t start,
5404 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005406 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005407 PyUnicodeObject* str_obj;
5408 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005409
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5411 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005413 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5414 if (!sub_obj) {
5415 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 return -1;
5417 }
Tim Petersced69f82003-09-16 20:30:58 +00005418
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005420
Thomas Wouters477c8d52006-05-27 19:21:47 +00005421 result = stringlib_count(
5422 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5423 );
5424
5425 Py_DECREF(sub_obj);
5426 Py_DECREF(str_obj);
5427
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 return result;
5429}
5430
Martin v. Löwis18e16552006-02-15 17:27:45 +00005431Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005432 PyObject *sub,
5433 Py_ssize_t start,
5434 Py_ssize_t end,
5435 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005438
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005441 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 sub = PyUnicode_FromObject(sub);
5443 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005444 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005445 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
Tim Petersced69f82003-09-16 20:30:58 +00005447
Thomas Wouters477c8d52006-05-27 19:21:47 +00005448 if (direction > 0)
5449 result = stringlib_find_slice(
5450 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5451 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5452 start, end
5453 );
5454 else
5455 result = stringlib_rfind_slice(
5456 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5457 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5458 start, end
5459 );
5460
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005462 Py_DECREF(sub);
5463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 return result;
5465}
5466
Tim Petersced69f82003-09-16 20:30:58 +00005467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468int tailmatch(PyUnicodeObject *self,
5469 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 Py_ssize_t start,
5471 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 int direction)
5473{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 if (substring->length == 0)
5475 return 1;
5476
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
5479 end -= substring->length;
5480 if (end < start)
5481 return 0;
5482
5483 if (direction > 0) {
5484 if (Py_UNICODE_MATCH(self, end, substring))
5485 return 1;
5486 } else {
5487 if (Py_UNICODE_MATCH(self, start, substring))
5488 return 1;
5489 }
5490
5491 return 0;
5492}
5493
Martin v. Löwis18e16552006-02-15 17:27:45 +00005494Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005496 Py_ssize_t start,
5497 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 int direction)
5499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005500 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005501
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 str = PyUnicode_FromObject(str);
5503 if (str == NULL)
5504 return -1;
5505 substr = PyUnicode_FromObject(substr);
5506 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005507 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 return -1;
5509 }
Tim Petersced69f82003-09-16 20:30:58 +00005510
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 result = tailmatch((PyUnicodeObject *)str,
5512 (PyUnicodeObject *)substr,
5513 start, end, direction);
5514 Py_DECREF(str);
5515 Py_DECREF(substr);
5516 return result;
5517}
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519/* Apply fixfct filter to the Unicode object self and return a
5520 reference to the modified object */
5521
Tim Petersced69f82003-09-16 20:30:58 +00005522static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523PyObject *fixup(PyUnicodeObject *self,
5524 int (*fixfct)(PyUnicodeObject *s))
5525{
5526
5527 PyUnicodeObject *u;
5528
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005529 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 if (u == NULL)
5531 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005532
5533 Py_UNICODE_COPY(u->str, self->str, self->length);
5534
Tim Peters7a29bd52001-09-12 03:03:31 +00005535 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 /* fixfct should return TRUE if it modified the buffer. If
5537 FALSE, return a reference to the original buffer instead
5538 (to save space, not time) */
5539 Py_INCREF(self);
5540 Py_DECREF(u);
5541 return (PyObject*) self;
5542 }
5543 return (PyObject*) u;
5544}
5545
Tim Petersced69f82003-09-16 20:30:58 +00005546static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547int fixupper(PyUnicodeObject *self)
5548{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005549 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 Py_UNICODE *s = self->str;
5551 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 while (len-- > 0) {
5554 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 ch = Py_UNICODE_TOUPPER(*s);
5557 if (ch != *s) {
5558 status = 1;
5559 *s = ch;
5560 }
5561 s++;
5562 }
5563
5564 return status;
5565}
5566
Tim Petersced69f82003-09-16 20:30:58 +00005567static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568int fixlower(PyUnicodeObject *self)
5569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005570 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_UNICODE *s = self->str;
5572 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 while (len-- > 0) {
5575 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 ch = Py_UNICODE_TOLOWER(*s);
5578 if (ch != *s) {
5579 status = 1;
5580 *s = ch;
5581 }
5582 s++;
5583 }
5584
5585 return status;
5586}
5587
Tim Petersced69f82003-09-16 20:30:58 +00005588static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589int fixswapcase(PyUnicodeObject *self)
5590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 Py_UNICODE *s = self->str;
5593 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 while (len-- > 0) {
5596 if (Py_UNICODE_ISUPPER(*s)) {
5597 *s = Py_UNICODE_TOLOWER(*s);
5598 status = 1;
5599 } else if (Py_UNICODE_ISLOWER(*s)) {
5600 *s = Py_UNICODE_TOUPPER(*s);
5601 status = 1;
5602 }
5603 s++;
5604 }
5605
5606 return status;
5607}
5608
Tim Petersced69f82003-09-16 20:30:58 +00005609static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610int fixcapitalize(PyUnicodeObject *self)
5611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005613 Py_UNICODE *s = self->str;
5614 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005615
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005616 if (len == 0)
5617 return 0;
5618 if (Py_UNICODE_ISLOWER(*s)) {
5619 *s = Py_UNICODE_TOUPPER(*s);
5620 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005622 s++;
5623 while (--len > 0) {
5624 if (Py_UNICODE_ISUPPER(*s)) {
5625 *s = Py_UNICODE_TOLOWER(*s);
5626 status = 1;
5627 }
5628 s++;
5629 }
5630 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631}
5632
5633static
5634int fixtitle(PyUnicodeObject *self)
5635{
5636 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5637 register Py_UNICODE *e;
5638 int previous_is_cased;
5639
5640 /* Shortcut for single character strings */
5641 if (PyUnicode_GET_SIZE(self) == 1) {
5642 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5643 if (*p != ch) {
5644 *p = ch;
5645 return 1;
5646 }
5647 else
5648 return 0;
5649 }
Tim Petersced69f82003-09-16 20:30:58 +00005650
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 e = p + PyUnicode_GET_SIZE(self);
5652 previous_is_cased = 0;
5653 for (; p < e; p++) {
5654 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005655
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 if (previous_is_cased)
5657 *p = Py_UNICODE_TOLOWER(ch);
5658 else
5659 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005660
5661 if (Py_UNICODE_ISLOWER(ch) ||
5662 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 Py_UNICODE_ISTITLE(ch))
5664 previous_is_cased = 1;
5665 else
5666 previous_is_cased = 0;
5667 }
5668 return 1;
5669}
5670
Tim Peters8ce9f162004-08-27 01:49:32 +00005671PyObject *
5672PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673{
Skip Montanaro6543b452004-09-16 03:28:13 +00005674 const Py_UNICODE blank = ' ';
5675 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005676 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005677 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005678 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5679 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005680 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5681 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005682 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005683 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 fseq = PySequence_Fast(seq, "");
5686 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005687 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005688 }
5689
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005690 /* NOTE: the following code can't call back into Python code,
5691 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005692 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005693
Tim Peters05eba1f2004-08-27 21:32:02 +00005694 seqlen = PySequence_Fast_GET_SIZE(fseq);
5695 /* If empty sequence, return u"". */
5696 if (seqlen == 0) {
5697 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5698 goto Done;
5699 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005700 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005701 /* If singleton sequence with an exact Unicode, return that. */
5702 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005703 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005704 if (PyUnicode_CheckExact(item)) {
5705 Py_INCREF(item);
5706 res = (PyUnicodeObject *)item;
5707 goto Done;
5708 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005709 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005710 else {
5711 /* Set up sep and seplen */
5712 if (separator == NULL) {
5713 sep = &blank;
5714 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005715 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005716 else {
5717 if (!PyUnicode_Check(separator)) {
5718 PyErr_Format(PyExc_TypeError,
5719 "separator: expected str instance,"
5720 " %.80s found",
5721 Py_TYPE(separator)->tp_name);
5722 goto onError;
5723 }
5724 sep = PyUnicode_AS_UNICODE(separator);
5725 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005726 }
5727 }
5728
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005729 /* There are at least two things to join, or else we have a subclass
5730 * of str in the sequence.
5731 * Do a pre-pass to figure out the total amount of space we'll
5732 * need (sz), and see whether all argument are strings.
5733 */
5734 sz = 0;
5735 for (i = 0; i < seqlen; i++) {
5736 const Py_ssize_t old_sz = sz;
5737 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005738 if (!PyUnicode_Check(item)) {
5739 PyErr_Format(PyExc_TypeError,
5740 "sequence item %zd: expected str instance,"
5741 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005742 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005743 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005744 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005745 sz += PyUnicode_GET_SIZE(item);
5746 if (i != 0)
5747 sz += seplen;
5748 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5749 PyErr_SetString(PyExc_OverflowError,
5750 "join() result is too long for a Python string");
5751 goto onError;
5752 }
5753 }
Tim Petersced69f82003-09-16 20:30:58 +00005754
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005755 res = _PyUnicode_New(sz);
5756 if (res == NULL)
5757 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005758
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005759 /* Catenate everything. */
5760 res_p = PyUnicode_AS_UNICODE(res);
5761 for (i = 0; i < seqlen; ++i) {
5762 Py_ssize_t itemlen;
5763 item = items[i];
5764 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005765 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005766 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005767 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005768 res_p += seplen;
5769 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005770 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5771 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005772 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005773
Tim Peters8ce9f162004-08-27 01:49:32 +00005774 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005775 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 return (PyObject *)res;
5777
5778 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005779 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005780 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 return NULL;
5782}
5783
Tim Petersced69f82003-09-16 20:30:58 +00005784static
5785PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005786 Py_ssize_t left,
5787 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 Py_UNICODE fill)
5789{
5790 PyUnicodeObject *u;
5791
5792 if (left < 0)
5793 left = 0;
5794 if (right < 0)
5795 right = 0;
5796
Tim Peters7a29bd52001-09-12 03:03:31 +00005797 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 Py_INCREF(self);
5799 return self;
5800 }
5801
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005802 if (left > PY_SSIZE_T_MAX - self->length ||
5803 right > PY_SSIZE_T_MAX - (left + self->length)) {
5804 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5805 return NULL;
5806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 u = _PyUnicode_New(left + self->length + right);
5808 if (u) {
5809 if (left)
5810 Py_UNICODE_FILL(u->str, fill, left);
5811 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5812 if (right)
5813 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5814 }
5815
5816 return u;
5817}
5818
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expanding function must provide the variables `PyObject *str`
   (scratch) and `list`, plus an `onError` label that is jumped to when
   creating or appending the slice fails.  The temporary reference held
   in `str` is always released. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5829
5830static
5831PyObject *split_whitespace(PyUnicodeObject *self,
5832 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 register Py_ssize_t i;
5836 register Py_ssize_t j;
5837 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005839 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
5841 for (i = j = 0; i < len; ) {
5842 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005843 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 i++;
5845 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005846 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 i++;
5848 if (j < i) {
5849 if (maxcount-- <= 0)
5850 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005851 SPLIT_APPEND(buf, j, i);
5852 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 i++;
5854 j = i;
5855 }
5856 }
5857 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005858 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
5860 return list;
5861
5862 onError:
5863 Py_DECREF(list);
5864 return NULL;
5865}
5866
5867PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005868 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 register Py_ssize_t i;
5871 register Py_ssize_t j;
5872 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 PyObject *list;
5874 PyObject *str;
5875 Py_UNICODE *data;
5876
5877 string = PyUnicode_FromObject(string);
5878 if (string == NULL)
5879 return NULL;
5880 data = PyUnicode_AS_UNICODE(string);
5881 len = PyUnicode_GET_SIZE(string);
5882
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 list = PyList_New(0);
5884 if (!list)
5885 goto onError;
5886
5887 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005888 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005891 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
5894 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005895 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (i < len) {
5897 if (data[i] == '\r' && i + 1 < len &&
5898 data[i+1] == '\n')
5899 i += 2;
5900 else
5901 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005902 if (keepends)
5903 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossum86662912000-04-11 15:38:46 +00005905 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 j = i;
5907 }
5908 if (j < len) {
5909 SPLIT_APPEND(data, j, len);
5910 }
5911
5912 Py_DECREF(string);
5913 return list;
5914
5915 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005916 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 Py_DECREF(string);
5918 return NULL;
5919}
5920
Tim Petersced69f82003-09-16 20:30:58 +00005921static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922PyObject *split_char(PyUnicodeObject *self,
5923 PyObject *list,
5924 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005927 register Py_ssize_t i;
5928 register Py_ssize_t j;
5929 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005931 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
5933 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005934 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (maxcount-- <= 0)
5936 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005937 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 i = j = i + 1;
5939 } else
5940 i++;
5941 }
5942 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005943 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
5945 return list;
5946
5947 onError:
5948 Py_DECREF(list);
5949 return NULL;
5950}
5951
Tim Petersced69f82003-09-16 20:30:58 +00005952static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953PyObject *split_substring(PyUnicodeObject *self,
5954 PyObject *list,
5955 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005958 register Py_ssize_t i;
5959 register Py_ssize_t j;
5960 Py_ssize_t len = self->length;
5961 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 PyObject *str;
5963
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005964 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 if (Py_UNICODE_MATCH(self, i, substring)) {
5966 if (maxcount-- <= 0)
5967 break;
5968 SPLIT_APPEND(self->str, j, i);
5969 i = j = i + sublen;
5970 } else
5971 i++;
5972 }
5973 if (j <= len) {
5974 SPLIT_APPEND(self->str, j, len);
5975 }
5976 return list;
5977
5978 onError:
5979 Py_DECREF(list);
5980 return NULL;
5981}
5982
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005983static
5984PyObject *rsplit_whitespace(PyUnicodeObject *self,
5985 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005986 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005987{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005988 register Py_ssize_t i;
5989 register Py_ssize_t j;
5990 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005992 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005993
5994 for (i = j = len - 1; i >= 0; ) {
5995 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005996 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005997 i--;
5998 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005999 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006000 i--;
6001 if (j > i) {
6002 if (maxcount-- <= 0)
6003 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006004 SPLIT_APPEND(buf, i + 1, j + 1);
6005 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006006 i--;
6007 j = i;
6008 }
6009 }
6010 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006011 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013 if (PyList_Reverse(list) < 0)
6014 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006015 return list;
6016
6017 onError:
6018 Py_DECREF(list);
6019 return NULL;
6020}
6021
6022static
6023PyObject *rsplit_char(PyUnicodeObject *self,
6024 PyObject *list,
6025 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006028 register Py_ssize_t i;
6029 register Py_ssize_t j;
6030 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006031 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006032 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006033
6034 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006035 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006036 if (maxcount-- <= 0)
6037 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006038 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006039 j = i = i - 1;
6040 } else
6041 i--;
6042 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006043 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006044 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006045 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006046 if (PyList_Reverse(list) < 0)
6047 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006048 return list;
6049
6050 onError:
6051 Py_DECREF(list);
6052 return NULL;
6053}
6054
6055static
6056PyObject *rsplit_substring(PyUnicodeObject *self,
6057 PyObject *list,
6058 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006060{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006061 register Py_ssize_t i;
6062 register Py_ssize_t j;
6063 Py_ssize_t len = self->length;
6064 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006065 PyObject *str;
6066
6067 for (i = len - sublen, j = len; i >= 0; ) {
6068 if (Py_UNICODE_MATCH(self, i, substring)) {
6069 if (maxcount-- <= 0)
6070 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006071 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006072 j = i;
6073 i -= sublen;
6074 } else
6075 i--;
6076 }
6077 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006078 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006079 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006080 if (PyList_Reverse(list) < 0)
6081 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006082 return list;
6083
6084 onError:
6085 Py_DECREF(list);
6086 return NULL;
6087}
6088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089#undef SPLIT_APPEND
6090
6091static
6092PyObject *split(PyUnicodeObject *self,
6093 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006094 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
6096 PyObject *list;
6097
6098 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006099 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
6101 list = PyList_New(0);
6102 if (!list)
6103 return NULL;
6104
6105 if (substring == NULL)
6106 return split_whitespace(self,list,maxcount);
6107
6108 else if (substring->length == 1)
6109 return split_char(self,list,substring->str[0],maxcount);
6110
6111 else if (substring->length == 0) {
6112 Py_DECREF(list);
6113 PyErr_SetString(PyExc_ValueError, "empty separator");
6114 return NULL;
6115 }
6116 else
6117 return split_substring(self,list,substring,maxcount);
6118}
6119
Tim Petersced69f82003-09-16 20:30:58 +00006120static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006121PyObject *rsplit(PyUnicodeObject *self,
6122 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006124{
6125 PyObject *list;
6126
6127 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006128 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006129
6130 list = PyList_New(0);
6131 if (!list)
6132 return NULL;
6133
6134 if (substring == NULL)
6135 return rsplit_whitespace(self,list,maxcount);
6136
6137 else if (substring->length == 1)
6138 return rsplit_char(self,list,substring->str[0],maxcount);
6139
6140 else if (substring->length == 0) {
6141 Py_DECREF(list);
6142 PyErr_SetString(PyExc_ValueError, "empty separator");
6143 return NULL;
6144 }
6145 else
6146 return rsplit_substring(self,list,substring,maxcount);
6147}
6148
6149static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150PyObject *replace(PyUnicodeObject *self,
6151 PyUnicodeObject *str1,
6152 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
6155 PyUnicodeObject *u;
6156
6157 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006158 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Thomas Wouters477c8d52006-05-27 19:21:47 +00006160 if (str1->length == str2->length) {
6161 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006162 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 if (str1->length == 1) {
6164 /* replace characters */
6165 Py_UNICODE u1, u2;
6166 if (!findchar(self->str, self->length, str1->str[0]))
6167 goto nothing;
6168 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6169 if (!u)
6170 return NULL;
6171 Py_UNICODE_COPY(u->str, self->str, self->length);
6172 u1 = str1->str[0];
6173 u2 = str2->str[0];
6174 for (i = 0; i < u->length; i++)
6175 if (u->str[i] == u1) {
6176 if (--maxcount < 0)
6177 break;
6178 u->str[i] = u2;
6179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006181 i = fastsearch(
6182 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006184 if (i < 0)
6185 goto nothing;
6186 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6187 if (!u)
6188 return NULL;
6189 Py_UNICODE_COPY(u->str, self->str, self->length);
6190 while (i <= self->length - str1->length)
6191 if (Py_UNICODE_MATCH(self, i, str1)) {
6192 if (--maxcount < 0)
6193 break;
6194 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6195 i += str1->length;
6196 } else
6197 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006200
6201 Py_ssize_t n, i, j, e;
6202 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 Py_UNICODE *p;
6204
6205 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006206 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 if (n > maxcount)
6208 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006209 if (n == 0)
6210 goto nothing;
6211 /* new_size = self->length + n * (str2->length - str1->length)); */
6212 delta = (str2->length - str1->length);
6213 if (delta == 0) {
6214 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006216 product = n * (str2->length - str1->length);
6217 if ((product / (str2->length - str1->length)) != n) {
6218 PyErr_SetString(PyExc_OverflowError,
6219 "replace string is too long");
6220 return NULL;
6221 }
6222 new_size = self->length + product;
6223 if (new_size < 0) {
6224 PyErr_SetString(PyExc_OverflowError,
6225 "replace string is too long");
6226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 }
6228 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006229 u = _PyUnicode_New(new_size);
6230 if (!u)
6231 return NULL;
6232 i = 0;
6233 p = u->str;
6234 e = self->length - str1->length;
6235 if (str1->length > 0) {
6236 while (n-- > 0) {
6237 /* look for next match */
6238 j = i;
6239 while (j <= e) {
6240 if (Py_UNICODE_MATCH(self, j, str1))
6241 break;
6242 j++;
6243 }
6244 if (j > i) {
6245 if (j > e)
6246 break;
6247 /* copy unchanged part [i:j] */
6248 Py_UNICODE_COPY(p, self->str+i, j-i);
6249 p += j - i;
6250 }
6251 /* copy substitution string */
6252 if (str2->length > 0) {
6253 Py_UNICODE_COPY(p, str2->str, str2->length);
6254 p += str2->length;
6255 }
6256 i = j + str1->length;
6257 }
6258 if (i < self->length)
6259 /* copy tail [i:] */
6260 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6261 } else {
6262 /* interleave */
6263 while (n > 0) {
6264 Py_UNICODE_COPY(p, str2->str, str2->length);
6265 p += str2->length;
6266 if (--n <= 0)
6267 break;
6268 *p++ = self->str[i++];
6269 }
6270 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006274
6275nothing:
6276 /* nothing to replace; return original string (when possible) */
6277 if (PyUnicode_CheckExact(self)) {
6278 Py_INCREF(self);
6279 return (PyObject *) self;
6280 }
6281 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282}
6283
6284/* --- Unicode Object Methods --------------------------------------------- */
6285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006286PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006287"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288\n\
6289Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006290characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291
6292static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006293unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 return fixup(self, fixtitle);
6296}
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006299"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300\n\
6301Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006302have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303
6304static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006305unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 return fixup(self, fixcapitalize);
6308}
6309
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous item, so drop
           it by hand before overwriting the slot. */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6347
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006348/* Argument converter. Coerces to a single unicode character */
6349
6350static int
6351convert_uc(PyObject *obj, void *addr)
6352{
6353 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6354 PyObject *uniobj;
6355 Py_UNICODE *unistr;
6356
6357 uniobj = PyUnicode_FromObject(obj);
6358 if (uniobj == NULL) {
6359 PyErr_SetString(PyExc_TypeError,
6360 "The fill character cannot be converted to Unicode");
6361 return 0;
6362 }
6363 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6364 PyErr_SetString(PyExc_TypeError,
6365 "The fill character must be exactly one character long");
6366 Py_DECREF(uniobj);
6367 return 0;
6368 }
6369 unistr = PyUnicode_AS_UNICODE(uniobj);
6370 *fillcharloc = unistr[0];
6371 Py_DECREF(uniobj);
6372 return 1;
6373}
6374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006375PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006376"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006378Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006379done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380
6381static PyObject *
6382unicode_center(PyUnicodeObject *self, PyObject *args)
6383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006384 Py_ssize_t marg, left;
6385 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006386 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Thomas Woutersde017742006-02-16 19:34:37 +00006388 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 return NULL;
6390
Tim Peters7a29bd52001-09-12 03:03:31 +00006391 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 Py_INCREF(self);
6393 return (PyObject*) self;
6394 }
6395
6396 marg = width - self->length;
6397 left = marg / 2 + (marg & width & 1);
6398
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006399 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400}
6401
Marc-André Lemburge5034372000-08-08 08:04:29 +00006402#if 0
6403
6404/* This code should go into some future Unicode collation support
6405 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006406 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006407
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006408/* speedy UTF-16 code point order comparison */
6409/* gleaned from: */
6410/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006412static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006413{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006414 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006415 0, 0, 0, 0, 0, 0, 0, 0,
6416 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006417 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006418};
6419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420static int
6421unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6422{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006423 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 Py_UNICODE *s1 = str1->str;
6426 Py_UNICODE *s2 = str2->str;
6427
6428 len1 = str1->length;
6429 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006432 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006433
6434 c1 = *s1++;
6435 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006436
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006437 if (c1 > (1<<11) * 26)
6438 c1 += utf16Fixup[c1>>11];
6439 if (c2 > (1<<11) * 26)
6440 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006441 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006442
6443 if (c1 != c2)
6444 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006445
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006446 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
6448
6449 return (len1 < len2) ? -1 : (len1 != len2);
6450}
6451
Marc-André Lemburge5034372000-08-08 08:04:29 +00006452#else
6453
6454static int
6455unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006458
6459 Py_UNICODE *s1 = str1->str;
6460 Py_UNICODE *s2 = str2->str;
6461
6462 len1 = str1->length;
6463 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006464
Marc-André Lemburge5034372000-08-08 08:04:29 +00006465 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006466 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006467
Fredrik Lundh45714e92001-06-26 16:39:36 +00006468 c1 = *s1++;
6469 c2 = *s2++;
6470
6471 if (c1 != c2)
6472 return (c1 < c2) ? -1 : 1;
6473
Marc-André Lemburge5034372000-08-08 08:04:29 +00006474 len1--; len2--;
6475 }
6476
6477 return (len1 < len2) ? -1 : (len1 != len2);
6478}
6479
6480#endif
6481
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482int PyUnicode_Compare(PyObject *left,
6483 PyObject *right)
6484{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006485 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6486 return unicode_compare((PyUnicodeObject *)left,
6487 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006488 PyErr_Format(PyExc_TypeError,
6489 "Can't compare %.100s and %.100s",
6490 left->ob_type->tp_name,
6491 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 return -1;
6493}
6494
Martin v. Löwis5b222132007-06-10 09:51:05 +00006495int
6496PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6497{
6498 int i;
6499 Py_UNICODE *id;
6500 assert(PyUnicode_Check(uni));
6501 id = PyUnicode_AS_UNICODE(uni);
6502 /* Compare Unicode string and source character set string */
6503 for (i = 0; id[i] && str[i]; i++)
6504 if (id[i] != str[i])
6505 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6506 if (id[i])
6507 return 1; /* uni is longer */
6508 if (str[i])
6509 return -1; /* str is longer */
6510 return 0;
6511}
6512
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006513
6514#define TEST_COND(cond) \
6515 ((cond) ? Py_True : Py_False)
6516
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006517PyObject *PyUnicode_RichCompare(PyObject *left,
6518 PyObject *right,
6519 int op)
6520{
6521 int result;
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006522
6523 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6524 PyObject *v;
6525 if (((PyUnicodeObject *) left)->length !=
6526 ((PyUnicodeObject *) right)->length) {
6527 if (op == Py_EQ) {
6528 Py_INCREF(Py_False);
6529 return Py_False;
6530 }
6531 if (op == Py_NE) {
6532 Py_INCREF(Py_True);
6533 return Py_True;
6534 }
6535 }
6536 if (left == right)
6537 result = 0;
6538 else
6539 result = unicode_compare((PyUnicodeObject *)left,
6540 (PyUnicodeObject *)right);
6541
6542 /* Convert the return value to a Boolean */
6543 switch (op) {
6544 case Py_EQ:
6545 v = TEST_COND(result == 0);
6546 break;
6547 case Py_NE:
6548 v = TEST_COND(result != 0);
6549 break;
6550 case Py_LE:
6551 v = TEST_COND(result <= 0);
6552 break;
6553 case Py_GE:
6554 v = TEST_COND(result >= 0);
6555 break;
6556 case Py_LT:
6557 v = TEST_COND(result == -1);
6558 break;
6559 case Py_GT:
6560 v = TEST_COND(result == 1);
6561 break;
6562 default:
6563 PyErr_BadArgument();
6564 return NULL;
6565 }
6566 Py_INCREF(v);
6567 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006568 }
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006569
6570 Py_INCREF(Py_NotImplemented);
6571 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006572}
6573
Guido van Rossum403d68b2000-03-13 15:55:09 +00006574int PyUnicode_Contains(PyObject *container,
6575 PyObject *element)
6576{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006577 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006579
6580 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 sub = PyUnicode_FromObject(element);
6582 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006583 PyErr_Format(PyExc_TypeError,
6584 "'in <string>' requires string as left operand, not %s",
6585 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006587 }
6588
Thomas Wouters477c8d52006-05-27 19:21:47 +00006589 str = PyUnicode_FromObject(container);
6590 if (!str) {
6591 Py_DECREF(sub);
6592 return -1;
6593 }
6594
6595 result = stringlib_contains_obj(str, sub);
6596
6597 Py_DECREF(str);
6598 Py_DECREF(sub);
6599
Guido van Rossum403d68b2000-03-13 15:55:09 +00006600 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* Concat to string or Unicode object giving a new Unicode object. */
6604
6605PyObject *PyUnicode_Concat(PyObject *left,
6606 PyObject *right)
6607{
6608 PyUnicodeObject *u = NULL, *v = NULL, *w;
6609
6610 /* Coerce the two arguments */
6611 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6612 if (u == NULL)
6613 goto onError;
6614 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6615 if (v == NULL)
6616 goto onError;
6617
6618 /* Shortcuts */
6619 if (v == unicode_empty) {
6620 Py_DECREF(v);
6621 return (PyObject *)u;
6622 }
6623 if (u == unicode_empty) {
6624 Py_DECREF(u);
6625 return (PyObject *)v;
6626 }
6627
6628 /* Concat the two Unicode strings */
6629 w = _PyUnicode_New(u->length + v->length);
6630 if (w == NULL)
6631 goto onError;
6632 Py_UNICODE_COPY(w->str, u->str, u->length);
6633 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6634
6635 Py_DECREF(u);
6636 Py_DECREF(v);
6637 return (PyObject *)w;
6638
6639onError:
6640 Py_XDECREF(u);
6641 Py_XDECREF(v);
6642 return NULL;
6643}
6644
Walter Dörwald1ab83302007-05-18 17:15:44 +00006645void
6646PyUnicode_Append(PyObject **pleft, PyObject *right)
6647{
6648 PyObject *new;
6649 if (*pleft == NULL)
6650 return;
6651 if (right == NULL || !PyUnicode_Check(*pleft)) {
6652 Py_DECREF(*pleft);
6653 *pleft = NULL;
6654 return;
6655 }
6656 new = PyUnicode_Concat(*pleft, right);
6657 Py_DECREF(*pleft);
6658 *pleft = new;
6659}
6660
6661void
6662PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6663{
6664 PyUnicode_Append(pleft, right);
6665 Py_XDECREF(right);
6666}
6667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669"S.count(sub[, start[, end]]) -> int\n\
6670\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006671Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006672string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675static PyObject *
6676unicode_count(PyUnicodeObject *self, PyObject *args)
6677{
6678 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006680 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 PyObject *result;
6682
Guido van Rossumb8872e62000-05-09 14:14:27 +00006683 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6684 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 return NULL;
6686
6687 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 if (substring == NULL)
6690 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Christian Heimes217cfd12007-12-02 14:31:20 +00006694 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006695 stringlib_count(self->str + start, end - start,
6696 substring->str, substring->length)
6697 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 return result;
6702}
6703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006705"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006707Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006708to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006709handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6711'xmlcharrefreplace' as well as any other name registered with\n\
6712codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject *
6715unicode_encode(PyUnicodeObject *self, PyObject *args)
6716{
6717 char *encoding = NULL;
6718 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006719 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6722 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006723 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006724 if (v == NULL)
6725 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006726 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006727 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006728 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006729 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006730 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006731 Py_DECREF(v);
6732 return NULL;
6733 }
6734 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006735
6736 onError:
6737 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006738}
6739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006741"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742\n\
6743Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006744If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
6746static PyObject*
6747unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6748{
6749 Py_UNICODE *e;
6750 Py_UNICODE *p;
6751 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006752 Py_UNICODE *qe;
6753 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 PyUnicodeObject *u;
6755 int tabsize = 8;
6756
6757 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6758 return NULL;
6759
Thomas Wouters7e474022000-07-16 12:04:32 +00006760 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006761 i = 0; /* chars up to and including most recent \n or \r */
6762 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6763 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 for (p = self->str; p < e; p++)
6765 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006766 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006767 incr = tabsize - (j % tabsize); /* cannot overflow */
6768 if (j > PY_SSIZE_T_MAX - incr)
6769 goto overflow1;
6770 j += incr;
6771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 }
6773 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006774 if (j > PY_SSIZE_T_MAX - 1)
6775 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 j++;
6777 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006778 if (i > PY_SSIZE_T_MAX - j)
6779 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006781 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 }
6783 }
6784
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006785 if (i > PY_SSIZE_T_MAX - j)
6786 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006787
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 /* Second pass: create output string and fill it */
6789 u = _PyUnicode_New(i + j);
6790 if (!u)
6791 return NULL;
6792
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006793 j = 0; /* same as in first pass */
6794 q = u->str; /* next output char */
6795 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
6797 for (p = self->str; p < e; p++)
6798 if (*p == '\t') {
6799 if (tabsize > 0) {
6800 i = tabsize - (j % tabsize);
6801 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006802 while (i--) {
6803 if (q >= qe)
6804 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
6808 }
6809 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006810 if (q >= qe)
6811 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006813 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 if (*p == '\n' || *p == '\r')
6815 j = 0;
6816 }
6817
6818 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006819
6820 overflow2:
6821 Py_DECREF(u);
6822 overflow1:
6823 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825}
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006828"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829\n\
6830Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006831such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832arguments start and end are interpreted as in slice notation.\n\
6833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006834Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
6836static PyObject *
6837unicode_find(PyUnicodeObject *self, PyObject *args)
6838{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006839 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006840 Py_ssize_t start;
6841 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006842 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Christian Heimes9cd17752007-11-18 19:35:23 +00006844 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846
Thomas Wouters477c8d52006-05-27 19:21:47 +00006847 result = stringlib_find_slice(
6848 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6849 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6850 start, end
6851 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
6853 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006854
Christian Heimes217cfd12007-12-02 14:31:20 +00006855 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856}
6857
6858static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006859unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860{
6861 if (index < 0 || index >= self->length) {
6862 PyErr_SetString(PyExc_IndexError, "string index out of range");
6863 return NULL;
6864 }
6865
6866 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6867}
6868
Guido van Rossumc2504932007-09-18 19:42:40 +00006869/* Believe it or not, this produces the same value for ASCII strings
6870 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006872unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
Guido van Rossumc2504932007-09-18 19:42:40 +00006874 Py_ssize_t len;
6875 Py_UNICODE *p;
6876 long x;
6877
6878 if (self->hash != -1)
6879 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006880 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006881 p = self->str;
6882 x = *p << 7;
6883 while (--len >= 0)
6884 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006885 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006886 if (x == -1)
6887 x = -2;
6888 self->hash = x;
6889 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890}
6891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006893"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006895Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897static PyObject *
6898unicode_index(PyUnicodeObject *self, PyObject *args)
6899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006901 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006902 Py_ssize_t start;
6903 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
Christian Heimes9cd17752007-11-18 19:35:23 +00006905 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
Thomas Wouters477c8d52006-05-27 19:21:47 +00006908 result = stringlib_find_slice(
6909 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6910 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6911 start, end
6912 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
6914 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 if (result < 0) {
6917 PyErr_SetString(PyExc_ValueError, "substring not found");
6918 return NULL;
6919 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006920
Christian Heimes217cfd12007-12-02 14:31:20 +00006921 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
6930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006931unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932{
6933 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6934 register const Py_UNICODE *e;
6935 int cased;
6936
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 /* Shortcut for single character strings */
6938 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006941 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006942 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 e = p + PyUnicode_GET_SIZE(self);
6946 cased = 0;
6947 for (; p < e; p++) {
6948 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 else if (!cased && Py_UNICODE_ISLOWER(ch))
6953 cased = 1;
6954 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956}
6957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006958PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006961Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963
6964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006965unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966{
6967 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6968 register const Py_UNICODE *e;
6969 int cased;
6970
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 /* Shortcut for single character strings */
6972 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006975 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006976 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 e = p + PyUnicode_GET_SIZE(self);
6980 cased = 0;
6981 for (; p < e; p++) {
6982 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 else if (!cased && Py_UNICODE_ISUPPER(ch))
6987 cased = 1;
6988 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990}
6991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006995Return True if S is a titlecased string and there is at least one\n\
6996character in S, i.e. upper- and titlecase characters may only\n\
6997follow uncased characters and lowercase characters only cased ones.\n\
6998Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
7000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007001unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002{
7003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7004 register const Py_UNICODE *e;
7005 int cased, previous_is_cased;
7006
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 /* Shortcut for single character strings */
7008 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7010 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007012 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007013 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007015
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 e = p + PyUnicode_GET_SIZE(self);
7017 cased = 0;
7018 previous_is_cased = 0;
7019 for (; p < e; p++) {
7020 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007021
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7023 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007024 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 previous_is_cased = 1;
7026 cased = 1;
7027 }
7028 else if (Py_UNICODE_ISLOWER(ch)) {
7029 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007030 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 previous_is_cased = 1;
7032 cased = 1;
7033 }
7034 else
7035 previous_is_cased = 0;
7036 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007037 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007041"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007043Return True if all characters in S are whitespace\n\
7044and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
7046static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048{
7049 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7050 register const Py_UNICODE *e;
7051
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 /* Shortcut for single character strings */
7053 if (PyUnicode_GET_SIZE(self) == 1 &&
7054 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007055 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007057 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007058 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007059 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007060
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 e = p + PyUnicode_GET_SIZE(self);
7062 for (; p < e; p++) {
7063 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007064 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007066 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067}
7068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007069PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007070"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007071\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007072Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007073and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007074
7075static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007076unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007077{
7078 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7079 register const Py_UNICODE *e;
7080
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007081 /* Shortcut for single character strings */
7082 if (PyUnicode_GET_SIZE(self) == 1 &&
7083 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007084 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007085
7086 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007087 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007088 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089
7090 e = p + PyUnicode_GET_SIZE(self);
7091 for (; p < e; p++) {
7092 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007093 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007094 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007095 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007096}
7097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007099"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007100\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007101Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007103
7104static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007105unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007106{
7107 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7108 register const Py_UNICODE *e;
7109
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007110 /* Shortcut for single character strings */
7111 if (PyUnicode_GET_SIZE(self) == 1 &&
7112 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007113 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007114
7115 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007116 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007117 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007118
7119 e = p + PyUnicode_GET_SIZE(self);
7120 for (; p < e; p++) {
7121 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007122 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007123 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007124 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007125}
7126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007128"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007130Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007134unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
7136 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7137 register const Py_UNICODE *e;
7138
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 /* Shortcut for single character strings */
7140 if (PyUnicode_GET_SIZE(self) == 1 &&
7141 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007142 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007144 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007145 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007146 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007147
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 e = p + PyUnicode_GET_SIZE(self);
7149 for (; p < e; p++) {
7150 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007151 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007153 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154}
7155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007157"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007159Return True if all characters in S are digits\n\
7160and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161
7162static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007163unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164{
7165 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7166 register const Py_UNICODE *e;
7167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 /* Shortcut for single character strings */
7169 if (PyUnicode_GET_SIZE(self) == 1 &&
7170 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007171 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007173 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007174 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007175 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007176
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 e = p + PyUnicode_GET_SIZE(self);
7178 for (; p < e; p++) {
7179 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007180 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007182 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183}
7184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007185PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007186"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007188Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
7191static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007192unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
7194 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7195 register const Py_UNICODE *e;
7196
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 /* Shortcut for single character strings */
7198 if (PyUnicode_GET_SIZE(self) == 1 &&
7199 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007200 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007202 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007203 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007205
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 e = p + PyUnicode_GET_SIZE(self);
7207 for (; p < e; p++) {
7208 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007209 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007211 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212}
7213
Martin v. Löwis47383402007-08-15 07:32:56 +00007214int
7215PyUnicode_IsIdentifier(PyObject *self)
7216{
7217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7218 register const Py_UNICODE *e;
7219
7220 /* Special case for empty strings */
7221 if (PyUnicode_GET_SIZE(self) == 0)
7222 return 0;
7223
7224 /* PEP 3131 says that the first character must be in
7225 XID_Start and subsequent characters in XID_Continue,
7226 and for the ASCII range, the 2.x rules apply (i.e
7227 start with letters and underscore, continue with
7228 letters, digits, underscore). However, given the current
7229 definition of XID_Start and XID_Continue, it is sufficient
7230 to check just for these, except that _ must be allowed
7231 as starting an identifier. */
7232 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7233 return 0;
7234
7235 e = p + PyUnicode_GET_SIZE(self);
7236 for (p++; p < e; p++) {
7237 if (!_PyUnicode_IsXidContinue(*p))
7238 return 0;
7239 }
7240 return 1;
7241}
7242
7243PyDoc_STRVAR(isidentifier__doc__,
7244"S.isidentifier() -> bool\n\
7245\n\
7246Return True if S is a valid identifier according\n\
7247to the language definition.");
7248
7249static PyObject*
7250unicode_isidentifier(PyObject *self)
7251{
7252 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7253}
7254
Georg Brandl559e5d72008-06-11 18:37:52 +00007255PyDoc_STRVAR(isprintable__doc__,
7256"S.isprintable() -> bool\n\
7257\n\
7258Return True if all characters in S are considered\n\
7259printable in repr() or S is empty, False otherwise.");
7260
7261static PyObject*
7262unicode_isprintable(PyObject *self)
7263{
7264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7265 register const Py_UNICODE *e;
7266
7267 /* Shortcut for single character strings */
7268 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7269 Py_RETURN_TRUE;
7270 }
7271
7272 e = p + PyUnicode_GET_SIZE(self);
7273 for (; p < e; p++) {
7274 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7275 Py_RETURN_FALSE;
7276 }
7277 }
7278 Py_RETURN_TRUE;
7279}
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007282"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283\n\
7284Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007285sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286
7287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007288unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007290 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291}
7292
Martin v. Löwis18e16552006-02-15 17:27:45 +00007293static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294unicode_length(PyUnicodeObject *self)
7295{
7296 return self->length;
7297}
7298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007299PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007300"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007302Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007303done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
7305static PyObject *
7306unicode_ljust(PyUnicodeObject *self, PyObject *args)
7307{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007308 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007309 Py_UNICODE fillchar = ' ';
7310
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007311 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 return NULL;
7313
Tim Peters7a29bd52001-09-12 03:03:31 +00007314 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 Py_INCREF(self);
7316 return (PyObject*) self;
7317 }
7318
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007319 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320}
7321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007323"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
7327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007328unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 return fixup(self, fixlower);
7331}
7332
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7341
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007342/* externally visible for str.strip(unicode) */
7343PyObject *
7344_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7345{
7346 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007348 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007349 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7350 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007351
Thomas Wouters477c8d52006-05-27 19:21:47 +00007352 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7353
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007354 i = 0;
7355 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007356 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7357 i++;
7358 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007359 }
7360
7361 j = len;
7362 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007363 do {
7364 j--;
7365 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7366 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007367 }
7368
7369 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007370 Py_INCREF(self);
7371 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007372 }
7373 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007374 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007375}
7376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007379do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007381 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007382 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007383
7384 i = 0;
7385 if (striptype != RIGHTSTRIP) {
7386 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7387 i++;
7388 }
7389 }
7390
7391 j = len;
7392 if (striptype != LEFTSTRIP) {
7393 do {
7394 j--;
7395 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7396 j++;
7397 }
7398
7399 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7400 Py_INCREF(self);
7401 return (PyObject*)self;
7402 }
7403 else
7404 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405}
7406
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007407
7408static PyObject *
7409do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7410{
7411 PyObject *sep = NULL;
7412
7413 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7414 return NULL;
7415
7416 if (sep != NULL && sep != Py_None) {
7417 if (PyUnicode_Check(sep))
7418 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007419 else {
7420 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007421 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007422 STRIPNAME(striptype));
7423 return NULL;
7424 }
7425 }
7426
7427 return do_strip(self, striptype);
7428}
7429
7430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007432"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007433\n\
7434Return a copy of the string S with leading and trailing\n\
7435whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007436If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007437
7438static PyObject *
7439unicode_strip(PyUnicodeObject *self, PyObject *args)
7440{
7441 if (PyTuple_GET_SIZE(args) == 0)
7442 return do_strip(self, BOTHSTRIP); /* Common case */
7443 else
7444 return do_argstrip(self, BOTHSTRIP, args);
7445}
7446
7447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007448PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007449"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007450\n\
7451Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007452If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007453
7454static PyObject *
7455unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7456{
7457 if (PyTuple_GET_SIZE(args) == 0)
7458 return do_strip(self, LEFTSTRIP); /* Common case */
7459 else
7460 return do_argstrip(self, LEFTSTRIP, args);
7461}
7462
7463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007464PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007465"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007466\n\
7467Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007468If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007469
7470static PyObject *
7471unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7472{
7473 if (PyTuple_GET_SIZE(args) == 0)
7474 return do_strip(self, RIGHTSTRIP); /* Common case */
7475 else
7476 return do_argstrip(self, RIGHTSTRIP, args);
7477}
7478
7479
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482{
7483 PyUnicodeObject *u;
7484 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007485 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007486 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
7488 if (len < 0)
7489 len = 0;
7490
Tim Peters7a29bd52001-09-12 03:03:31 +00007491 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 /* no repeat, return original string */
7493 Py_INCREF(str);
7494 return (PyObject*) str;
7495 }
Tim Peters8f422462000-09-09 06:13:41 +00007496
7497 /* ensure # of chars needed doesn't overflow int and # of bytes
7498 * needed doesn't overflow size_t
7499 */
7500 nchars = len * str->length;
7501 if (len && nchars / len != str->length) {
7502 PyErr_SetString(PyExc_OverflowError,
7503 "repeated string is too long");
7504 return NULL;
7505 }
7506 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7507 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7508 PyErr_SetString(PyExc_OverflowError,
7509 "repeated string is too long");
7510 return NULL;
7511 }
7512 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 if (!u)
7514 return NULL;
7515
7516 p = u->str;
7517
Thomas Wouters477c8d52006-05-27 19:21:47 +00007518 if (str->length == 1 && len > 0) {
7519 Py_UNICODE_FILL(p, str->str[0], len);
7520 } else {
7521 Py_ssize_t done = 0; /* number of characters copied this far */
7522 if (done < nchars) {
7523 Py_UNICODE_COPY(p, str->str, str->length);
7524 done = str->length;
7525 }
7526 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007527 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 Py_UNICODE_COPY(p+done, p, n);
7529 done += n;
7530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 }
7532
7533 return (PyObject*) u;
7534}
7535
7536PyObject *PyUnicode_Replace(PyObject *obj,
7537 PyObject *subobj,
7538 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007539 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540{
7541 PyObject *self;
7542 PyObject *str1;
7543 PyObject *str2;
7544 PyObject *result;
7545
7546 self = PyUnicode_FromObject(obj);
7547 if (self == NULL)
7548 return NULL;
7549 str1 = PyUnicode_FromObject(subobj);
7550 if (str1 == NULL) {
7551 Py_DECREF(self);
7552 return NULL;
7553 }
7554 str2 = PyUnicode_FromObject(replobj);
7555 if (str2 == NULL) {
7556 Py_DECREF(self);
7557 Py_DECREF(str1);
7558 return NULL;
7559 }
Tim Petersced69f82003-09-16 20:30:58 +00007560 result = replace((PyUnicodeObject *)self,
7561 (PyUnicodeObject *)str1,
7562 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 maxcount);
7564 Py_DECREF(self);
7565 Py_DECREF(str1);
7566 Py_DECREF(str2);
7567 return result;
7568}
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007571"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572\n\
7573Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007574old replaced by new. If the optional argument count is\n\
7575given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
7578unicode_replace(PyUnicodeObject *self, PyObject *args)
7579{
7580 PyUnicodeObject *str1;
7581 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 PyObject *result;
7584
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 return NULL;
7587 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7588 if (str1 == NULL)
7589 return NULL;
7590 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007591 if (str2 == NULL) {
7592 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
7596 result = replace(self, str1, str2, maxcount);
7597
7598 Py_DECREF(str1);
7599 Py_DECREF(str2);
7600 return result;
7601}
7602
7603static
7604PyObject *unicode_repr(PyObject *unicode)
7605{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007606 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007607 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007608 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7609 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7610
7611 /* XXX(nnorwitz): rather than over-allocating, it would be
7612 better to choose a different scheme. Perhaps scan the
7613 first N-chars of the string and allocate based on that size.
7614 */
7615 /* Initial allocation is based on the longest-possible unichr
7616 escape.
7617
7618 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7619 unichr, so in this case it's the longest unichr escape. In
7620 narrow (UTF-16) builds this is five chars per source unichr
7621 since there are two unichrs in the surrogate pair, so in narrow
7622 (UTF-16) builds it's not the longest unichr escape.
7623
7624 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7625 so in the narrow (UTF-16) build case it's the longest unichr
7626 escape.
7627 */
7628
Walter Dörwald1ab83302007-05-18 17:15:44 +00007629 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007630 2 /* quotes */
7631#ifdef Py_UNICODE_WIDE
7632 + 10*size
7633#else
7634 + 6*size
7635#endif
7636 + 1);
7637 if (repr == NULL)
7638 return NULL;
7639
Walter Dörwald1ab83302007-05-18 17:15:44 +00007640 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007641
7642 /* Add quote */
7643 *p++ = (findchar(s, size, '\'') &&
7644 !findchar(s, size, '"')) ? '"' : '\'';
7645 while (size-- > 0) {
7646 Py_UNICODE ch = *s++;
7647
7648 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007649 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007650 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007651 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007652 continue;
7653 }
7654
Georg Brandl559e5d72008-06-11 18:37:52 +00007655 /* Map special whitespace to '\t', \n', '\r' */
7656 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007657 *p++ = '\\';
7658 *p++ = 't';
7659 }
7660 else if (ch == '\n') {
7661 *p++ = '\\';
7662 *p++ = 'n';
7663 }
7664 else if (ch == '\r') {
7665 *p++ = '\\';
7666 *p++ = 'r';
7667 }
7668
7669 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007670 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007671 *p++ = '\\';
7672 *p++ = 'x';
7673 *p++ = hexdigits[(ch >> 4) & 0x000F];
7674 *p++ = hexdigits[ch & 0x000F];
7675 }
7676
Georg Brandl559e5d72008-06-11 18:37:52 +00007677 /* Copy ASCII characters as-is */
7678 else if (ch < 0x7F) {
7679 *p++ = ch;
7680 }
7681
7682 /* Non-ASCII characters */
7683 else {
7684 Py_UCS4 ucs = ch;
7685
7686#ifndef Py_UNICODE_WIDE
7687 Py_UNICODE ch2 = 0;
7688 /* Get code point from surrogate pair */
7689 if (size > 0) {
7690 ch2 = *s;
7691 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7692 && ch2 <= 0xDFFF) {
7693 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7694 + 0x00010000;
7695 s++;
7696 size--;
7697 }
7698 }
7699#endif
7700 /* Map Unicode whitespace and control characters
7701 (categories Z* and C* except ASCII space)
7702 */
7703 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7704 /* Map 8-bit characters to '\xhh' */
7705 if (ucs <= 0xff) {
7706 *p++ = '\\';
7707 *p++ = 'x';
7708 *p++ = hexdigits[(ch >> 4) & 0x000F];
7709 *p++ = hexdigits[ch & 0x000F];
7710 }
7711 /* Map 21-bit characters to '\U00xxxxxx' */
7712 else if (ucs >= 0x10000) {
7713 *p++ = '\\';
7714 *p++ = 'U';
7715 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7716 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7717 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7718 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7719 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7720 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7721 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7722 *p++ = hexdigits[ucs & 0x0000000F];
7723 }
7724 /* Map 16-bit characters to '\uxxxx' */
7725 else {
7726 *p++ = '\\';
7727 *p++ = 'u';
7728 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7729 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7730 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7731 *p++ = hexdigits[ucs & 0x000F];
7732 }
7733 }
7734 /* Copy characters as-is */
7735 else {
7736 *p++ = ch;
7737#ifndef Py_UNICODE_WIDE
7738 if (ucs >= 0x10000)
7739 *p++ = ch2;
7740#endif
7741 }
7742 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007743 }
7744 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007745 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007746
7747 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007748 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007749 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750}
7751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007752PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007753"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754\n\
7755Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007756such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757arguments start and end are interpreted as in slice notation.\n\
7758\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
7761static PyObject *
7762unicode_rfind(PyUnicodeObject *self, PyObject *args)
7763{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007764 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007765 Py_ssize_t start;
7766 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007767 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768
Christian Heimes9cd17752007-11-18 19:35:23 +00007769 if (!_ParseTupleFinds(args, &substring, &start, &end))
7770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
Thomas Wouters477c8d52006-05-27 19:21:47 +00007772 result = stringlib_rfind_slice(
7773 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7774 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7775 start, end
7776 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
7778 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007779
Christian Heimes217cfd12007-12-02 14:31:20 +00007780 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781}
7782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007783PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007784"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
7788static PyObject *
7789unicode_rindex(PyUnicodeObject *self, PyObject *args)
7790{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007791 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007792 Py_ssize_t start;
7793 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007794 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
Christian Heimes9cd17752007-11-18 19:35:23 +00007796 if (!_ParseTupleFinds(args, &substring, &start, &end))
7797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
Thomas Wouters477c8d52006-05-27 19:21:47 +00007799 result = stringlib_rfind_slice(
7800 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7801 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7802 start, end
7803 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007806
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 if (result < 0) {
7808 PyErr_SetString(PyExc_ValueError, "substring not found");
7809 return NULL;
7810 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007811 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812}
7813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007814PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007815"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007817Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007818done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
7820static PyObject *
7821unicode_rjust(PyUnicodeObject *self, PyObject *args)
7822{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007823 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007824 Py_UNICODE fillchar = ' ';
7825
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007826 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 return NULL;
7828
Tim Peters7a29bd52001-09-12 03:03:31 +00007829 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 Py_INCREF(self);
7831 return (PyObject*) self;
7832 }
7833
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007834 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835}
7836
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837PyObject *PyUnicode_Split(PyObject *s,
7838 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007839 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840{
7841 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007842
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 s = PyUnicode_FromObject(s);
7844 if (s == NULL)
7845 return NULL;
7846 if (sep != NULL) {
7847 sep = PyUnicode_FromObject(sep);
7848 if (sep == NULL) {
7849 Py_DECREF(s);
7850 return NULL;
7851 }
7852 }
7853
7854 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7855
7856 Py_DECREF(s);
7857 Py_XDECREF(sep);
7858 return result;
7859}
7860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007861PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007862"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863\n\
7864Return a list of the words in S, using sep as the\n\
7865delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007866splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007867whitespace string is a separator and empty strings are\n\
7868removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869
7870static PyObject*
7871unicode_split(PyUnicodeObject *self, PyObject *args)
7872{
7873 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007874 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Martin v. Löwis18e16552006-02-15 17:27:45 +00007876 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 return NULL;
7878
7879 if (substring == Py_None)
7880 return split(self, NULL, maxcount);
7881 else if (PyUnicode_Check(substring))
7882 return split(self, (PyUnicodeObject *)substring, maxcount);
7883 else
7884 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7885}
7886
Thomas Wouters477c8d52006-05-27 19:21:47 +00007887PyObject *
7888PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7889{
7890 PyObject* str_obj;
7891 PyObject* sep_obj;
7892 PyObject* out;
7893
7894 str_obj = PyUnicode_FromObject(str_in);
7895 if (!str_obj)
7896 return NULL;
7897 sep_obj = PyUnicode_FromObject(sep_in);
7898 if (!sep_obj) {
7899 Py_DECREF(str_obj);
7900 return NULL;
7901 }
7902
7903 out = stringlib_partition(
7904 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7905 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7906 );
7907
7908 Py_DECREF(sep_obj);
7909 Py_DECREF(str_obj);
7910
7911 return out;
7912}
7913
7914
7915PyObject *
7916PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7917{
7918 PyObject* str_obj;
7919 PyObject* sep_obj;
7920 PyObject* out;
7921
7922 str_obj = PyUnicode_FromObject(str_in);
7923 if (!str_obj)
7924 return NULL;
7925 sep_obj = PyUnicode_FromObject(sep_in);
7926 if (!sep_obj) {
7927 Py_DECREF(str_obj);
7928 return NULL;
7929 }
7930
7931 out = stringlib_rpartition(
7932 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7933 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7934 );
7935
7936 Py_DECREF(sep_obj);
7937 Py_DECREF(str_obj);
7938
7939 return out;
7940}
7941
7942PyDoc_STRVAR(partition__doc__,
7943"S.partition(sep) -> (head, sep, tail)\n\
7944\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007945Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007946the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007947found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007948
7949static PyObject*
7950unicode_partition(PyUnicodeObject *self, PyObject *separator)
7951{
7952 return PyUnicode_Partition((PyObject *)self, separator);
7953}
7954
7955PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007956"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007957\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007958Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007959the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007960separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007961
7962static PyObject*
7963unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7964{
7965 return PyUnicode_RPartition((PyObject *)self, separator);
7966}
7967
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007968PyObject *PyUnicode_RSplit(PyObject *s,
7969 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007970 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007971{
7972 PyObject *result;
7973
7974 s = PyUnicode_FromObject(s);
7975 if (s == NULL)
7976 return NULL;
7977 if (sep != NULL) {
7978 sep = PyUnicode_FromObject(sep);
7979 if (sep == NULL) {
7980 Py_DECREF(s);
7981 return NULL;
7982 }
7983 }
7984
7985 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7986
7987 Py_DECREF(s);
7988 Py_XDECREF(sep);
7989 return result;
7990}
7991
7992PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007993"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007994\n\
7995Return a list of the words in S, using sep as the\n\
7996delimiter string, starting at the end of the string and\n\
7997working to the front. If maxsplit is given, at most maxsplit\n\
7998splits are done. If sep is not specified, any whitespace string\n\
7999is a separator.");
8000
8001static PyObject*
8002unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8003{
8004 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008005 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008006
Martin v. Löwis18e16552006-02-15 17:27:45 +00008007 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008008 return NULL;
8009
8010 if (substring == Py_None)
8011 return rsplit(self, NULL, maxcount);
8012 else if (PyUnicode_Check(substring))
8013 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8014 else
8015 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8016}
8017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson4469d0c2008-11-30 22:46:23 +00008019"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020\n\
8021Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008022Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008023is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
8025static PyObject*
8026unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8027{
Guido van Rossum86662912000-04-11 15:38:46 +00008028 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029
Guido van Rossum86662912000-04-11 15:38:46 +00008030 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 return NULL;
8032
Guido van Rossum86662912000-04-11 15:38:46 +00008033 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034}
8035
8036static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008037PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Walter Dörwald346737f2007-05-31 10:44:43 +00008039 if (PyUnicode_CheckExact(self)) {
8040 Py_INCREF(self);
8041 return self;
8042 } else
8043 /* Subtype -- return genuine unicode string with the same value. */
8044 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8045 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046}
8047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008048PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008049"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050\n\
8051Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008052and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
8054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008055unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 return fixup(self, fixswapcase);
8058}
8059
Georg Brandlceee0772007-11-27 23:48:05 +00008060PyDoc_STRVAR(maketrans__doc__,
8061"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8062\n\
8063Return a translation table usable for str.translate().\n\
8064If there is only one argument, it must be a dictionary mapping Unicode\n\
8065ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008066Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008067If there are two arguments, they must be strings of equal length, and\n\
8068in the resulting dictionary, each character in x will be mapped to the\n\
8069character at the same position in y. If there is a third argument, it\n\
8070must be a string, whose characters will be mapped to None in the result.");
8071
8072static PyObject*
8073unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8074{
8075 PyObject *x, *y = NULL, *z = NULL;
8076 PyObject *new = NULL, *key, *value;
8077 Py_ssize_t i = 0;
8078 int res;
8079
8080 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8081 return NULL;
8082 new = PyDict_New();
8083 if (!new)
8084 return NULL;
8085 if (y != NULL) {
8086 /* x must be a string too, of equal length */
8087 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8088 if (!PyUnicode_Check(x)) {
8089 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8090 "be a string if there is a second argument");
8091 goto err;
8092 }
8093 if (PyUnicode_GET_SIZE(x) != ylen) {
8094 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8095 "arguments must have equal length");
8096 goto err;
8097 }
8098 /* create entries for translating chars in x to those in y */
8099 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008100 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8101 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008102 if (!key || !value)
8103 goto err;
8104 res = PyDict_SetItem(new, key, value);
8105 Py_DECREF(key);
8106 Py_DECREF(value);
8107 if (res < 0)
8108 goto err;
8109 }
8110 /* create entries for deleting chars in z */
8111 if (z != NULL) {
8112 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008113 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008114 if (!key)
8115 goto err;
8116 res = PyDict_SetItem(new, key, Py_None);
8117 Py_DECREF(key);
8118 if (res < 0)
8119 goto err;
8120 }
8121 }
8122 } else {
8123 /* x must be a dict */
8124 if (!PyDict_Check(x)) {
8125 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8126 "to maketrans it must be a dict");
8127 goto err;
8128 }
8129 /* copy entries into the new dict, converting string keys to int keys */
8130 while (PyDict_Next(x, &i, &key, &value)) {
8131 if (PyUnicode_Check(key)) {
8132 /* convert string keys to integer keys */
8133 PyObject *newkey;
8134 if (PyUnicode_GET_SIZE(key) != 1) {
8135 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8136 "table must be of length 1");
8137 goto err;
8138 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008139 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008140 if (!newkey)
8141 goto err;
8142 res = PyDict_SetItem(new, newkey, value);
8143 Py_DECREF(newkey);
8144 if (res < 0)
8145 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008146 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008147 /* just keep integer keys */
8148 if (PyDict_SetItem(new, key, value) < 0)
8149 goto err;
8150 } else {
8151 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8152 "be strings or integers");
8153 goto err;
8154 }
8155 }
8156 }
8157 return new;
8158 err:
8159 Py_DECREF(new);
8160 return NULL;
8161}
8162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008163PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008164"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165\n\
8166Return a copy of the string S, where all characters have been mapped\n\
8167through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008168Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008169Unmapped characters are left untouched. Characters mapped to None\n\
8170are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
8172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008173unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174{
Georg Brandlceee0772007-11-27 23:48:05 +00008175 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176}
8177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008178PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008179"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008181Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
8183static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008184unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 return fixup(self, fixupper);
8187}
8188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008189PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008190"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008192Pad a numeric string S with zeros on the left, to fill a field\n\
8193of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
8195static PyObject *
8196unicode_zfill(PyUnicodeObject *self, PyObject *args)
8197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008198 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 PyUnicodeObject *u;
8200
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201 Py_ssize_t width;
8202 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 return NULL;
8204
8205 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008206 if (PyUnicode_CheckExact(self)) {
8207 Py_INCREF(self);
8208 return (PyObject*) self;
8209 }
8210 else
8211 return PyUnicode_FromUnicode(
8212 PyUnicode_AS_UNICODE(self),
8213 PyUnicode_GET_SIZE(self)
8214 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 }
8216
8217 fill = width - self->length;
8218
8219 u = pad(self, fill, 0, '0');
8220
Walter Dörwald068325e2002-04-15 13:36:47 +00008221 if (u == NULL)
8222 return NULL;
8223
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 if (u->str[fill] == '+' || u->str[fill] == '-') {
8225 /* move sign to beginning of string */
8226 u->str[0] = u->str[fill];
8227 u->str[fill] = '0';
8228 }
8229
8230 return (PyObject*) u;
8231}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
#if 0
/* Compiled out: returns the current value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long cached = numfree;

    return PyLong_FromLong(cached);
}
#endif
8240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008241PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008242"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008244Return True if S starts with the specified prefix, False otherwise.\n\
8245With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008246With optional end, stop comparing S at that position.\n\
8247prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248
8249static PyObject *
8250unicode_startswith(PyUnicodeObject *self,
8251 PyObject *args)
8252{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008253 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008256 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008257 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008259 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008260 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008262 if (PyTuple_Check(subobj)) {
8263 Py_ssize_t i;
8264 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8265 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8266 PyTuple_GET_ITEM(subobj, i));
8267 if (substring == NULL)
8268 return NULL;
8269 result = tailmatch(self, substring, start, end, -1);
8270 Py_DECREF(substring);
8271 if (result) {
8272 Py_RETURN_TRUE;
8273 }
8274 }
8275 /* nothing matched */
8276 Py_RETURN_FALSE;
8277 }
8278 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008280 return NULL;
8281 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008283 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284}
8285
8286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008287PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008288"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008290Return True if S ends with the specified suffix, False otherwise.\n\
8291With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008292With optional end, stop comparing S at that position.\n\
8293suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
8295static PyObject *
8296unicode_endswith(PyUnicodeObject *self,
8297 PyObject *args)
8298{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008299 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008302 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008303 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008305 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8306 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008308 if (PyTuple_Check(subobj)) {
8309 Py_ssize_t i;
8310 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8311 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8312 PyTuple_GET_ITEM(subobj, i));
8313 if (substring == NULL)
8314 return NULL;
8315 result = tailmatch(self, substring, start, end, +1);
8316 Py_DECREF(substring);
8317 if (result) {
8318 Py_RETURN_TRUE;
8319 }
8320 }
8321 Py_RETURN_FALSE;
8322 }
8323 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008327 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008329 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Eric Smith8c663262007-08-25 02:26:07 +00008332#include "stringlib/string_format.h"
8333
8334PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008335"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008336\n\
8337");
8338
Eric Smith4a7d76d2008-05-30 18:10:19 +00008339static PyObject *
8340unicode__format__(PyObject* self, PyObject* args)
8341{
8342 PyObject *format_spec;
8343
8344 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8345 return NULL;
8346
8347 return _PyUnicode_FormatAdvanced(self,
8348 PyUnicode_AS_UNICODE(format_spec),
8349 PyUnicode_GET_SIZE(format_spec));
8350}
8351
Eric Smith8c663262007-08-25 02:26:07 +00008352PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008353"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008354\n\
8355");
8356
8357static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008358unicode__sizeof__(PyUnicodeObject *v)
8359{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008360 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8361 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008362}
8363
8364PyDoc_STRVAR(sizeof__doc__,
8365"S.__sizeof__() -> size of S in memory, in bytes");
8366
8367static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008368unicode_getnewargs(PyUnicodeObject *v)
8369{
8370 return Py_BuildValue("(u#)", v->str, v->length);
8371}
8372
8373
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374static PyMethodDef unicode_methods[] = {
8375
8376 /* Order is according to common usage: often used methods should
8377 appear first, since lookup is done sequentially. */
8378
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008379 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8380 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8381 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008382 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008383 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8384 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8385 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8386 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8387 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8388 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8389 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008390 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008391 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8392 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8393 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008394 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008395 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8396 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8397 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008398 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008399 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008400 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008401 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008402 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8403 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8404 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8405 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8406 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8407 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8408 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8409 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8410 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8411 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8412 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8413 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8414 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8415 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008416 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008417 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008418 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008419 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008420 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008421 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8422 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008423 {"maketrans", (PyCFunction) unicode_maketrans,
8424 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008425 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008426#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008427 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428#endif
8429
8430#if 0
8431 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008432 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433#endif
8434
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008435 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 {NULL, NULL}
8437};
8438
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008439static PyObject *
8440unicode_mod(PyObject *v, PyObject *w)
8441{
8442 if (!PyUnicode_Check(v)) {
8443 Py_INCREF(Py_NotImplemented);
8444 return Py_NotImplemented;
8445 }
8446 return PyUnicode_Format(v, w);
8447}
8448
8449static PyNumberMethods unicode_as_number = {
8450 0, /*nb_add*/
8451 0, /*nb_subtract*/
8452 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008453 unicode_mod, /*nb_remainder*/
8454};
8455
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008457 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008458 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008459 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8460 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008461 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 0, /* sq_ass_item */
8463 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008464 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465};
8466
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008467static PyObject*
8468unicode_subscript(PyUnicodeObject* self, PyObject* item)
8469{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008470 if (PyIndex_Check(item)) {
8471 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008472 if (i == -1 && PyErr_Occurred())
8473 return NULL;
8474 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008475 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008476 return unicode_getitem(self, i);
8477 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008479 Py_UNICODE* source_buf;
8480 Py_UNICODE* result_buf;
8481 PyObject* result;
8482
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008483 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008484 &start, &stop, &step, &slicelength) < 0) {
8485 return NULL;
8486 }
8487
8488 if (slicelength <= 0) {
8489 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008490 } else if (start == 0 && step == 1 && slicelength == self->length &&
8491 PyUnicode_CheckExact(self)) {
8492 Py_INCREF(self);
8493 return (PyObject *)self;
8494 } else if (step == 1) {
8495 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008496 } else {
8497 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008498 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8499 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008500
8501 if (result_buf == NULL)
8502 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008503
8504 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8505 result_buf[i] = source_buf[cur];
8506 }
Tim Petersced69f82003-09-16 20:30:58 +00008507
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008508 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008509 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008510 return result;
8511 }
8512 } else {
8513 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8514 return NULL;
8515 }
8516}
8517
8518static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008519 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008520 (binaryfunc)unicode_subscript, /* mp_subscript */
8521 (objobjargproc)0, /* mp_ass_subscript */
8522};
8523
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
/* Helpers for PyUnicode_Format() */
8526
8527static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008528getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008530 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 if (argidx < arglen) {
8532 (*p_argidx)++;
8533 if (arglen < 0)
8534 return args;
8535 else
8536 return PyTuple_GetItem(args, argidx);
8537 }
8538 PyErr_SetString(PyExc_TypeError,
8539 "not enough arguments for format string");
8540 return NULL;
8541}
8542
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008544strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008546 register Py_ssize_t i;
8547 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 for (i = len - 1; i >= 0; i--)
8549 buffer[i] = (Py_UNICODE) charbuffer[i];
8550
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 return len;
8552}
8553
Neal Norwitzfc76d632006-01-10 06:03:13 +00008554static int
8555doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8556{
Tim Peters15231542006-02-16 01:08:01 +00008557 Py_ssize_t result;
8558
Neal Norwitzfc76d632006-01-10 06:03:13 +00008559 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008560 result = strtounicode(buffer, (char *)buffer);
8561 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008562}
8563
#if 0
/* Disabled: format the long x into buffer as Py_UNICODE characters.

   The text is produced as bytes at the start of buffer by PyOS_snprintf()
   (limited to len bytes) and then widened in place by strtounicode().
   Returns the number of characters produced. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    Py_ssize_t nchars;

    PyOS_snprintf((char *)buffer, len, format, x);
    nchars = strtounicode(buffer, (char *)buffer);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008575
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8579
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580static int
8581formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008582 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 int flags,
8584 int prec,
8585 int type,
8586 PyObject *v)
8587{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008588 /* fmt = '%#.' + `prec` + `type`
8589 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 char fmt[20];
8591 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008592
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 x = PyFloat_AsDouble(v);
8594 if (x == -1.0 && PyErr_Occurred())
8595 return -1;
8596 if (prec < 0)
8597 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008598 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8599 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008600 /* Worst case length calc to ensure no buffer overrun:
8601
8602 'g' formats:
8603 fmt = %#.<prec>g
8604 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8605 for any double rep.)
8606 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8607
8608 'f' formats:
8609 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8610 len = 1 + 50 + 1 + prec = 52 + prec
8611
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008612 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008613 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008614
8615 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008616 if (((type == 'g' || type == 'G') &&
8617 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008618 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008619 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008620 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008621 return -1;
8622 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008623 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8624 (flags&F_ALT) ? "#" : "",
8625 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008626 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Tim Peters38fd5b62000-09-21 05:43:11 +00008629static PyObject*
8630formatlong(PyObject *val, int flags, int prec, int type)
8631{
8632 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008633 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008634 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008635 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008636
Christian Heimes72b710a2008-05-26 13:28:38 +00008637 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008638 if (!str)
8639 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008640 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008641 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008642 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008643}
8644
#if 0
/* Disabled: format the C long value of v into buf for a %-style integer
   conversion.

   Returns the number of characters written, or -1 with an exception set
   (conversion failure, or OverflowError when buf is too small). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;
    int is_radix;       /* 'x', 'X' or 'o' conversion */
    int negate;         /* emit our own '-' and format the magnitude */

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    if (x < 0 && type == 'u')
        type = 'd';

    is_radix = (type == 'x' || type == 'X' || type == 'o');
    negate = (x < 0 && is_radix);
    sign = negate ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_radix) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, negate ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722
8723static int
8724formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008725 size_t buflen,
8726 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008728 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008729 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008730 if (PyUnicode_GET_SIZE(v) == 1) {
8731 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8732 buf[1] = '\0';
8733 return 1;
8734 }
8735#ifndef Py_UNICODE_WIDE
8736 if (PyUnicode_GET_SIZE(v) == 2) {
8737 /* Decode a valid surrogate pair */
8738 int c0 = PyUnicode_AS_UNICODE(v)[0];
8739 int c1 = PyUnicode_AS_UNICODE(v)[1];
8740 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8741 0xDC00 <= c1 && c1 <= 0xDFFF) {
8742 buf[0] = c0;
8743 buf[1] = c1;
8744 buf[2] = '\0';
8745 return 2;
8746 }
8747 }
8748#endif
8749 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 else {
8752 /* Integer input truncated to a character */
8753 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008754 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008756 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008757
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008758 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008759 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008760 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008761 return -1;
8762 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008763
8764#ifndef Py_UNICODE_WIDE
8765 if (x > 0xffff) {
8766 x -= 0x10000;
8767 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8768 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8769 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008770 }
8771#endif
8772 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008773 buf[1] = '\0';
8774 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008776
8777 onError:
8778 PyErr_SetString(PyExc_TypeError,
8779 "%c requires int or char");
8780 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781}
8782
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8792
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793PyObject *PyUnicode_Format(PyObject *format,
8794 PyObject *args)
8795{
8796 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 int args_owned = 0;
8799 PyUnicodeObject *result = NULL;
8800 PyObject *dict = NULL;
8801 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008802
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 if (format == NULL || args == NULL) {
8804 PyErr_BadInternalCall();
8805 return NULL;
8806 }
8807 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008808 if (uformat == NULL)
8809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810 fmt = PyUnicode_AS_UNICODE(uformat);
8811 fmtcnt = PyUnicode_GET_SIZE(uformat);
8812
8813 reslen = rescnt = fmtcnt + 100;
8814 result = _PyUnicode_New(reslen);
8815 if (result == NULL)
8816 goto onError;
8817 res = PyUnicode_AS_UNICODE(result);
8818
8819 if (PyTuple_Check(args)) {
8820 arglen = PyTuple_Size(args);
8821 argidx = 0;
8822 }
8823 else {
8824 arglen = -1;
8825 argidx = -2;
8826 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008827 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008828 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 dict = args;
8830
8831 while (--fmtcnt >= 0) {
8832 if (*fmt != '%') {
8833 if (--rescnt < 0) {
8834 rescnt = fmtcnt + 100;
8835 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008836 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8839 --rescnt;
8840 }
8841 *res++ = *fmt++;
8842 }
8843 else {
8844 /* Got a format specifier */
8845 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008846 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 Py_UNICODE c = '\0';
8849 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008850 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 PyObject *v = NULL;
8852 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008853 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008855 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008856 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
8858 fmt++;
8859 if (*fmt == '(') {
8860 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008861 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 PyObject *key;
8863 int pcount = 1;
8864
8865 if (dict == NULL) {
8866 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008867 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 goto onError;
8869 }
8870 ++fmt;
8871 --fmtcnt;
8872 keystart = fmt;
8873 /* Skip over balanced parentheses */
8874 while (pcount > 0 && --fmtcnt >= 0) {
8875 if (*fmt == ')')
8876 --pcount;
8877 else if (*fmt == '(')
8878 ++pcount;
8879 fmt++;
8880 }
8881 keylen = fmt - keystart - 1;
8882 if (fmtcnt < 0 || pcount > 0) {
8883 PyErr_SetString(PyExc_ValueError,
8884 "incomplete format key");
8885 goto onError;
8886 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008887#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008888 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 then looked up since Python uses strings to hold
8890 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008891 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 key = PyUnicode_EncodeUTF8(keystart,
8893 keylen,
8894 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008895#else
8896 key = PyUnicode_FromUnicode(keystart, keylen);
8897#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898 if (key == NULL)
8899 goto onError;
8900 if (args_owned) {
8901 Py_DECREF(args);
8902 args_owned = 0;
8903 }
8904 args = PyObject_GetItem(dict, key);
8905 Py_DECREF(key);
8906 if (args == NULL) {
8907 goto onError;
8908 }
8909 args_owned = 1;
8910 arglen = -1;
8911 argidx = -2;
8912 }
8913 while (--fmtcnt >= 0) {
8914 switch (c = *fmt++) {
8915 case '-': flags |= F_LJUST; continue;
8916 case '+': flags |= F_SIGN; continue;
8917 case ' ': flags |= F_BLANK; continue;
8918 case '#': flags |= F_ALT; continue;
8919 case '0': flags |= F_ZERO; continue;
8920 }
8921 break;
8922 }
8923 if (c == '*') {
8924 v = getnextarg(args, arglen, &argidx);
8925 if (v == NULL)
8926 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008927 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 PyErr_SetString(PyExc_TypeError,
8929 "* wants int");
8930 goto onError;
8931 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008932 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008933 if (width == -1 && PyErr_Occurred())
8934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 if (width < 0) {
8936 flags |= F_LJUST;
8937 width = -width;
8938 }
8939 if (--fmtcnt >= 0)
8940 c = *fmt++;
8941 }
8942 else if (c >= '0' && c <= '9') {
8943 width = c - '0';
8944 while (--fmtcnt >= 0) {
8945 c = *fmt++;
8946 if (c < '0' || c > '9')
8947 break;
8948 if ((width*10) / 10 != width) {
8949 PyErr_SetString(PyExc_ValueError,
8950 "width too big");
8951 goto onError;
8952 }
8953 width = width*10 + (c - '0');
8954 }
8955 }
8956 if (c == '.') {
8957 prec = 0;
8958 if (--fmtcnt >= 0)
8959 c = *fmt++;
8960 if (c == '*') {
8961 v = getnextarg(args, arglen, &argidx);
8962 if (v == NULL)
8963 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008964 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 PyErr_SetString(PyExc_TypeError,
8966 "* wants int");
8967 goto onError;
8968 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008969 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008970 if (prec == -1 && PyErr_Occurred())
8971 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (prec < 0)
8973 prec = 0;
8974 if (--fmtcnt >= 0)
8975 c = *fmt++;
8976 }
8977 else if (c >= '0' && c <= '9') {
8978 prec = c - '0';
8979 while (--fmtcnt >= 0) {
8980 c = Py_CHARMASK(*fmt++);
8981 if (c < '0' || c > '9')
8982 break;
8983 if ((prec*10) / 10 != prec) {
8984 PyErr_SetString(PyExc_ValueError,
8985 "prec too big");
8986 goto onError;
8987 }
8988 prec = prec*10 + (c - '0');
8989 }
8990 }
8991 } /* prec */
8992 if (fmtcnt >= 0) {
8993 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 if (--fmtcnt >= 0)
8995 c = *fmt++;
8996 }
8997 }
8998 if (fmtcnt < 0) {
8999 PyErr_SetString(PyExc_ValueError,
9000 "incomplete format");
9001 goto onError;
9002 }
9003 if (c != '%') {
9004 v = getnextarg(args, arglen, &argidx);
9005 if (v == NULL)
9006 goto onError;
9007 }
9008 sign = 0;
9009 fill = ' ';
9010 switch (c) {
9011
9012 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009013 pbuf = formatbuf;
9014 /* presume that buffer length is at least 1 */
9015 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 len = 1;
9017 break;
9018
9019 case 's':
9020 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009021 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (PyUnicode_Check(v) && c == 's') {
9023 temp = v;
9024 Py_INCREF(temp);
9025 }
9026 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009028 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009029 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009031 else
9032 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 if (temp == NULL)
9034 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009035 if (PyUnicode_Check(temp))
9036 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009037 else {
9038 Py_DECREF(temp);
9039 PyErr_SetString(PyExc_TypeError,
9040 "%s argument has non-string str()");
9041 goto onError;
9042 }
9043 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009044 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 len = PyUnicode_GET_SIZE(temp);
9046 if (prec >= 0 && len > prec)
9047 len = prec;
9048 break;
9049
9050 case 'i':
9051 case 'd':
9052 case 'u':
9053 case 'o':
9054 case 'x':
9055 case 'X':
9056 if (c == 'i')
9057 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009058 isnumok = 0;
9059 if (PyNumber_Check(v)) {
9060 PyObject *iobj=NULL;
9061
9062 if (PyLong_Check(v)) {
9063 iobj = v;
9064 Py_INCREF(iobj);
9065 }
9066 else {
9067 iobj = PyNumber_Long(v);
9068 }
9069 if (iobj!=NULL) {
9070 if (PyLong_Check(iobj)) {
9071 isnumok = 1;
9072 temp = formatlong(iobj, flags, prec, c);
9073 Py_DECREF(iobj);
9074 if (!temp)
9075 goto onError;
9076 pbuf = PyUnicode_AS_UNICODE(temp);
9077 len = PyUnicode_GET_SIZE(temp);
9078 sign = 1;
9079 }
9080 else {
9081 Py_DECREF(iobj);
9082 }
9083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009085 if (!isnumok) {
9086 PyErr_Format(PyExc_TypeError,
9087 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009088 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009089 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009090 }
9091 if (flags & F_ZERO)
9092 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 break;
9094
9095 case 'e':
9096 case 'E':
9097 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009098 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 case 'g':
9100 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009101 if (c == 'F')
9102 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009103 pbuf = formatbuf;
9104 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9105 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106 if (len < 0)
9107 goto onError;
9108 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009109 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 fill = '0';
9111 break;
9112
9113 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009114 pbuf = formatbuf;
9115 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116 if (len < 0)
9117 goto onError;
9118 break;
9119
9120 default:
9121 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009122 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009123 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009124 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009125 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009126 (Py_ssize_t)(fmt - 1 -
9127 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 goto onError;
9129 }
9130 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009131 if (*pbuf == '-' || *pbuf == '+') {
9132 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 len--;
9134 }
9135 else if (flags & F_SIGN)
9136 sign = '+';
9137 else if (flags & F_BLANK)
9138 sign = ' ';
9139 else
9140 sign = 0;
9141 }
9142 if (width < len)
9143 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009144 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 reslen -= rescnt;
9146 rescnt = width + fmtcnt + 100;
9147 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009148 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009149 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009150 PyErr_NoMemory();
9151 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009152 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009153 if (_PyUnicode_Resize(&result, reslen) < 0) {
9154 Py_XDECREF(temp);
9155 goto onError;
9156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 res = PyUnicode_AS_UNICODE(result)
9158 + reslen - rescnt;
9159 }
9160 if (sign) {
9161 if (fill != ' ')
9162 *res++ = sign;
9163 rescnt--;
9164 if (width > len)
9165 width--;
9166 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009167 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009168 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009169 assert(pbuf[1] == c);
9170 if (fill != ' ') {
9171 *res++ = *pbuf++;
9172 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009173 }
Tim Petersfff53252001-04-12 18:38:48 +00009174 rescnt -= 2;
9175 width -= 2;
9176 if (width < 0)
9177 width = 0;
9178 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 if (width > len && !(flags & F_LJUST)) {
9181 do {
9182 --rescnt;
9183 *res++ = fill;
9184 } while (--width > len);
9185 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009186 if (fill == ' ') {
9187 if (sign)
9188 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009189 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009190 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009191 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009192 *res++ = *pbuf++;
9193 *res++ = *pbuf++;
9194 }
9195 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009196 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 res += len;
9198 rescnt -= len;
9199 while (--width >= len) {
9200 --rescnt;
9201 *res++ = ' ';
9202 }
9203 if (dict && (argidx < arglen) && c != '%') {
9204 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009205 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009206 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 goto onError;
9208 }
9209 Py_XDECREF(temp);
9210 } /* '%' */
9211 } /* until end */
9212 if (argidx < arglen && !dict) {
9213 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009214 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 goto onError;
9216 }
9217
Thomas Woutersa96affe2006-03-12 00:29:36 +00009218 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9219 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 if (args_owned) {
9221 Py_DECREF(args);
9222 }
9223 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224 return (PyObject *)result;
9225
9226 onError:
9227 Py_XDECREF(result);
9228 Py_DECREF(uformat);
9229 if (args_owned) {
9230 Py_DECREF(args);
9231 }
9232 return NULL;
9233}
9234
Jeremy Hylton938ace62002-07-17 16:30:39 +00009235static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009236unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9237
Tim Peters6d6c1a32001-08-02 04:15:00 +00009238static PyObject *
9239unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9240{
9241 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009242 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009243 char *encoding = NULL;
9244 char *errors = NULL;
9245
Guido van Rossume023fe02001-08-30 03:12:59 +00009246 if (type != &PyUnicode_Type)
9247 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009248 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009249 kwlist, &x, &encoding, &errors))
9250 return NULL;
9251 if (x == NULL)
9252 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009253 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009254 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009255 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009256 return PyUnicode_FromEncodedObject(x, encoding, errors);
9257}
9258
Guido van Rossume023fe02001-08-30 03:12:59 +00009259static PyObject *
9260unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9261{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009262 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009263 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009264
9265 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9266 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9267 if (tmp == NULL)
9268 return NULL;
9269 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009270 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009271 if (pnew == NULL) {
9272 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009273 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009274 }
Christian Heimesb186d002008-03-18 15:15:01 +00009275 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009276 if (pnew->str == NULL) {
9277 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009278 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009279 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009280 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009281 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009282 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9283 pnew->length = n;
9284 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009285 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009286 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009287}
9288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009289PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009290"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009291\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009292Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009293encoding defaults to the current default string encoding.\n\
9294errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009295
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009296static PyObject *unicode_iter(PyObject *seq);
9297
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009299 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009300 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 sizeof(PyUnicodeObject), /* tp_size */
9302 0, /* tp_itemsize */
9303 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009304 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009306 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009308 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009309 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009310 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009312 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 (hashfunc) unicode_hash, /* tp_hash*/
9314 0, /* tp_call*/
9315 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009316 PyObject_GenericGetAttr, /* tp_getattro */
9317 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009318 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009319 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9320 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009321 unicode_doc, /* tp_doc */
9322 0, /* tp_traverse */
9323 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009324 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009325 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009326 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009327 0, /* tp_iternext */
9328 unicode_methods, /* tp_methods */
9329 0, /* tp_members */
9330 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009331 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009332 0, /* tp_dict */
9333 0, /* tp_descr_get */
9334 0, /* tp_descr_set */
9335 0, /* tp_dictoffset */
9336 0, /* tp_init */
9337 0, /* tp_alloc */
9338 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009339 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340};
9341
9342/* Initialize the Unicode implementation */
9343
Thomas Wouters78890102000-07-22 19:25:51 +00009344void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009346 int i;
9347
Thomas Wouters477c8d52006-05-27 19:21:47 +00009348 /* XXX - move this array to unicodectype.c ? */
9349 Py_UNICODE linebreak[] = {
9350 0x000A, /* LINE FEED */
9351 0x000D, /* CARRIAGE RETURN */
9352 0x001C, /* FILE SEPARATOR */
9353 0x001D, /* GROUP SEPARATOR */
9354 0x001E, /* RECORD SEPARATOR */
9355 0x0085, /* NEXT LINE */
9356 0x2028, /* LINE SEPARATOR */
9357 0x2029, /* PARAGRAPH SEPARATOR */
9358 };
9359
Fred Drakee4315f52000-05-09 19:53:39 +00009360 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009361 free_list = NULL;
9362 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009364 if (!unicode_empty)
9365 return;
9366
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009367 for (i = 0; i < 256; i++)
9368 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009369 if (PyType_Ready(&PyUnicode_Type) < 0)
9370 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009371
9372 /* initialize the linebreak bloom filter */
9373 bloom_linebreak = make_bloom_mask(
9374 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9375 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009376
9377 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378}
9379
9380/* Finalize the Unicode implementation */
9381
Christian Heimesa156e092008-02-16 07:38:31 +00009382int
9383PyUnicode_ClearFreeList(void)
9384{
9385 int freelist_size = numfree;
9386 PyUnicodeObject *u;
9387
9388 for (u = free_list; u != NULL;) {
9389 PyUnicodeObject *v = u;
9390 u = *(PyUnicodeObject **)u;
9391 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009392 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009393 Py_XDECREF(v->defenc);
9394 PyObject_Del(v);
9395 numfree--;
9396 }
9397 free_list = NULL;
9398 assert(numfree == 0);
9399 return freelist_size;
9400}
9401
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402void
Thomas Wouters78890102000-07-22 19:25:51 +00009403_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009405 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009407 Py_XDECREF(unicode_empty);
9408 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009409
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009410 for (i = 0; i < 256; i++) {
9411 if (unicode_latin1[i]) {
9412 Py_DECREF(unicode_latin1[i]);
9413 unicode_latin1[i] = NULL;
9414 }
9415 }
Christian Heimesa156e092008-02-16 07:38:31 +00009416 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009418
Walter Dörwald16807132007-05-25 13:52:07 +00009419void
9420PyUnicode_InternInPlace(PyObject **p)
9421{
9422 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9423 PyObject *t;
9424 if (s == NULL || !PyUnicode_Check(s))
9425 Py_FatalError(
9426 "PyUnicode_InternInPlace: unicode strings only please!");
9427 /* If it's a subclass, we don't really know what putting
9428 it in the interned dict might do. */
9429 if (!PyUnicode_CheckExact(s))
9430 return;
9431 if (PyUnicode_CHECK_INTERNED(s))
9432 return;
9433 if (interned == NULL) {
9434 interned = PyDict_New();
9435 if (interned == NULL) {
9436 PyErr_Clear(); /* Don't leave an exception */
9437 return;
9438 }
9439 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009440 /* It might be that the GetItem call fails even
9441 though the key is present in the dictionary,
9442 namely when this happens during a stack overflow. */
9443 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009444 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009445 Py_END_ALLOW_RECURSION
9446
Walter Dörwald16807132007-05-25 13:52:07 +00009447 if (t) {
9448 Py_INCREF(t);
9449 Py_DECREF(*p);
9450 *p = t;
9451 return;
9452 }
9453
Martin v. Löwis5b222132007-06-10 09:51:05 +00009454 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009455 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9456 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009457 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009458 return;
9459 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009460 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009461 /* The two references in interned are not counted by refcnt.
9462 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009463 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009464 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9465}
9466
9467void
9468PyUnicode_InternImmortal(PyObject **p)
9469{
9470 PyUnicode_InternInPlace(p);
9471 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9472 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9473 Py_INCREF(*p);
9474 }
9475}
9476
9477PyObject *
9478PyUnicode_InternFromString(const char *cp)
9479{
9480 PyObject *s = PyUnicode_FromString(cp);
9481 if (s == NULL)
9482 return NULL;
9483 PyUnicode_InternInPlace(&s);
9484 return s;
9485}
9486
9487void _Py_ReleaseInternedUnicodeStrings(void)
9488{
9489 PyObject *keys;
9490 PyUnicodeObject *s;
9491 Py_ssize_t i, n;
9492 Py_ssize_t immortal_size = 0, mortal_size = 0;
9493
9494 if (interned == NULL || !PyDict_Check(interned))
9495 return;
9496 keys = PyDict_Keys(interned);
9497 if (keys == NULL || !PyList_Check(keys)) {
9498 PyErr_Clear();
9499 return;
9500 }
9501
9502 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9503 detector, interned unicode strings are not forcibly deallocated;
9504 rather, we give them their stolen references back, and then clear
9505 and DECREF the interned dict. */
9506
9507 n = PyList_GET_SIZE(keys);
9508 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9509 n);
9510 for (i = 0; i < n; i++) {
9511 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9512 switch (s->state) {
9513 case SSTATE_NOT_INTERNED:
9514 /* XXX Shouldn't happen */
9515 break;
9516 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009517 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009518 immortal_size += s->length;
9519 break;
9520 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009521 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009522 mortal_size += s->length;
9523 break;
9524 default:
9525 Py_FatalError("Inconsistent interned string state.");
9526 }
9527 s->state = SSTATE_NOT_INTERNED;
9528 }
9529 fprintf(stderr, "total size of all interned strings: "
9530 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9531 "mortal/immortal\n", mortal_size, immortal_size);
9532 Py_DECREF(keys);
9533 PyDict_Clear(interned);
9534 Py_DECREF(interned);
9535 interned = NULL;
9536}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009537
9538
9539/********************* Unicode Iterator **************************/
9540
9541typedef struct {
9542 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009543 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009544 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9545} unicodeiterobject;
9546
9547static void
9548unicodeiter_dealloc(unicodeiterobject *it)
9549{
9550 _PyObject_GC_UNTRACK(it);
9551 Py_XDECREF(it->it_seq);
9552 PyObject_GC_Del(it);
9553}
9554
9555static int
9556unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9557{
9558 Py_VISIT(it->it_seq);
9559 return 0;
9560}
9561
9562static PyObject *
9563unicodeiter_next(unicodeiterobject *it)
9564{
9565 PyUnicodeObject *seq;
9566 PyObject *item;
9567
9568 assert(it != NULL);
9569 seq = it->it_seq;
9570 if (seq == NULL)
9571 return NULL;
9572 assert(PyUnicode_Check(seq));
9573
9574 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009575 item = PyUnicode_FromUnicode(
9576 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009577 if (item != NULL)
9578 ++it->it_index;
9579 return item;
9580 }
9581
9582 Py_DECREF(seq);
9583 it->it_seq = NULL;
9584 return NULL;
9585}
9586
9587static PyObject *
9588unicodeiter_len(unicodeiterobject *it)
9589{
9590 Py_ssize_t len = 0;
9591 if (it->it_seq)
9592 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009593 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009594}
9595
9596PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9597
9598static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009599 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9600 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009601 {NULL, NULL} /* sentinel */
9602};
9603
9604PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009605 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009606 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009607 sizeof(unicodeiterobject), /* tp_basicsize */
9608 0, /* tp_itemsize */
9609 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009610 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009611 0, /* tp_print */
9612 0, /* tp_getattr */
9613 0, /* tp_setattr */
9614 0, /* tp_compare */
9615 0, /* tp_repr */
9616 0, /* tp_as_number */
9617 0, /* tp_as_sequence */
9618 0, /* tp_as_mapping */
9619 0, /* tp_hash */
9620 0, /* tp_call */
9621 0, /* tp_str */
9622 PyObject_GenericGetAttr, /* tp_getattro */
9623 0, /* tp_setattro */
9624 0, /* tp_as_buffer */
9625 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9626 0, /* tp_doc */
9627 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9628 0, /* tp_clear */
9629 0, /* tp_richcompare */
9630 0, /* tp_weaklistoffset */
9631 PyObject_SelfIter, /* tp_iter */
9632 (iternextfunc)unicodeiter_next, /* tp_iternext */
9633 unicodeiter_methods, /* tp_methods */
9634 0,
9635};
9636
9637static PyObject *
9638unicode_iter(PyObject *seq)
9639{
9640 unicodeiterobject *it;
9641
9642 if (!PyUnicode_Check(seq)) {
9643 PyErr_BadInternalCall();
9644 return NULL;
9645 }
9646 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9647 if (it == NULL)
9648 return NULL;
9649 it->it_index = 0;
9650 Py_INCREF(seq);
9651 it->it_seq = (PyUnicodeObject *)seq;
9652 _PyObject_GC_TRACK(it);
9653 return (PyObject *)it;
9654}
9655
Martin v. Löwis5b222132007-06-10 09:51:05 +00009656size_t
9657Py_UNICODE_strlen(const Py_UNICODE *u)
9658{
9659 int res = 0;
9660 while(*u++)
9661 res++;
9662 return res;
9663}
9664
9665Py_UNICODE*
9666Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9667{
9668 Py_UNICODE *u = s1;
9669 while ((*u++ = *s2++));
9670 return s1;
9671}
9672
9673Py_UNICODE*
9674Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9675{
9676 Py_UNICODE *u = s1;
9677 while ((*u++ = *s2++))
9678 if (n-- == 0)
9679 break;
9680 return s1;
9681}
9682
9683int
9684Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9685{
9686 while (*s1 && *s2 && *s1 == *s2)
9687 s1++, s2++;
9688 if (*s1 && *s2)
9689 return (*s1 < *s2) ? -1 : +1;
9690 if (*s1)
9691 return 1;
9692 if (*s2)
9693 return -1;
9694 return 0;
9695}
9696
9697Py_UNICODE*
9698Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9699{
9700 const Py_UNICODE *p;
9701 for (p = s; *p; p++)
9702 if (*p == c)
9703 return (Py_UNICODE*)p;
9704 return NULL;
9705}
9706
9707
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009708#ifdef __cplusplus
9709}
9710#endif
9711
9712
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009713/*
9714Local variables:
9715c-basic-offset: 4
9716indent-tabs-mode: nil
9717End:
9718*/