blob: 3cea89921d192a586e6c6e05152c3bade2e061c0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for objects parked on the free list.

   A Unicode object that lands on the free list keeps its character
   buffer allocated as long as its length is below this limit, which
   saves a malloc() round trip for small Unicode objects.

   Worst case, this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by ASCII code point (one row per 16 code points), where
   1 marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Fast detection of ASCII line breaks: a lookup table indexed by ASCII
   code point (one row per 16 code points), where 1 marks a line break
   character.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000188 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#endif
194}
195
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used, with the 5 least
   significant bits of each Unicode character selecting the bit index. */

/* The linebreak mask is set up by the Unicode initialization code
   (Unicode_Init below). */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by `ch` is set in `mask`.  The shifted
   constant is unsigned long so that selecting bit 31 neither overflows a
   signed int nor sign-extends into the upper bits of a wider mask, and
   `mask` is parenthesized so compound expressions can be passed in. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: direct table lookup for ASCII, bloom pre-filter
   followed by the full Py_UNICODE_ISLINEBREAK() check otherwise. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Membership test with a bloom pre-filter: the exact (linear) search in
   `set` only runs when the bit for `chr` is set in `mask`.  The whole
   expansion is parenthesized so the macro behaves as a single operand
   (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000379 case SSTATE_NOT_INTERNED:
380 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
384 Py_REFCNT(unicode) = 3;
385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
387 "deletion of interned string failed");
388 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_INTERNED_IMMORTAL:
391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 default:
394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401 PyObject_DEL(unicode->str);
402 unicode->str = NULL;
403 unicode->length = 0;
404 }
405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 PyObject_DEL(unicode->str);
416 Py_XDECREF(unicode->defenc);
417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyErr_BadInternalCall();
429 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
447 Py_DECREF(*unicode);
448 *unicode = w;
449 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
Benjamin Peterson29060642009-01-31 22:14:21 +0000471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000475 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
483 if (!unicode)
484 return NULL;
485 unicode->str[0] = *u;
486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
Benjamin Peterson14339b62009-01-31 16:36:08 +0000508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000510 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000511 return NULL;
512 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000525
526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
529 unicode = unicode_latin1[Py_CHARMASK(*u)];
530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
534 unicode->str[0] = Py_CHARMASK(*u);
535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
Mark Dickinson081dfee2009-03-18 14:47:41 +0000564#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
565# define CONVERT_WCHAR_TO_SURROGATES
566#endif
567
568#ifdef CONVERT_WCHAR_TO_SURROGATES
569
570/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
571 to convert from UTF32 to UTF16. */
572
573PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
574 Py_ssize_t size)
575{
576 PyUnicodeObject *unicode;
577 register Py_ssize_t i;
578 Py_ssize_t alloc;
579 const wchar_t *orig_w;
580
581 if (w == NULL) {
582 if (size == 0)
583 return PyUnicode_FromStringAndSize(NULL, 0);
584 PyErr_BadInternalCall();
585 return NULL;
586 }
587
588 if (size == -1) {
589 size = wcslen(w);
590 }
591
592 alloc = size;
593 orig_w = w;
594 for (i = size; i > 0; i--) {
595 if (*w > 0xFFFF)
596 alloc++;
597 w++;
598 }
599 w = orig_w;
600 unicode = _PyUnicode_New(alloc);
601 if (!unicode)
602 return NULL;
603
604 /* Copy the wchar_t data into the new object */
605 {
606 register Py_UNICODE *u;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--) {
609 if (*w > 0xFFFF) {
610 wchar_t ordinal = *w++;
611 ordinal -= 0x10000;
612 *u++ = 0xD800 | (ordinal >> 10);
613 *u++ = 0xDC00 | (ordinal & 0x3FF);
614 }
615 else
616 *u++ = *w++;
617 }
618 }
619 return (PyObject *)unicode;
620}
621
622#else
623
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000625 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626{
627 PyUnicodeObject *unicode;
628
629 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000630 if (size == 0)
631 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000632 PyErr_BadInternalCall();
633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634 }
635
Martin v. Löwis790465f2008-04-05 20:41:37 +0000636 if (size == -1) {
637 size = wcslen(w);
638 }
639
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640 unicode = _PyUnicode_New(size);
641 if (!unicode)
642 return NULL;
643
644 /* Copy the wchar_t data into the new object */
645#ifdef HAVE_USABLE_WCHAR_T
646 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000647#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 register Py_UNICODE *u;
650 register Py_ssize_t i;
651 u = PyUnicode_AS_UNICODE(unicode);
652 for (i = size; i > 0; i--)
653 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655#endif
656
657 return (PyObject *)unicode;
658}
659
Mark Dickinson081dfee2009-03-18 14:47:41 +0000660#endif /* CONVERT_WCHAR_TO_SURROGATES */
661
662#undef CONVERT_WCHAR_TO_SURROGATES
663
Walter Dörwald346737f2007-05-31 10:44:43 +0000664static void
665makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
666{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000667 *fmt++ = '%';
668 if (width) {
669 if (zeropad)
670 *fmt++ = '0';
671 fmt += sprintf(fmt, "%d", width);
672 }
673 if (precision)
674 fmt += sprintf(fmt, ".%d", precision);
675 if (longflag)
676 *fmt++ = 'l';
677 else if (size_tflag) {
678 char *f = PY_FORMAT_SIZE_T;
679 while (*f)
680 *fmt++ = *f++;
681 }
682 *fmt++ = c;
683 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000684}
685
/* Append the NUL-terminated C string `string` at the output cursor `s`,
   advancing `s`; uses the caller's `copy` variable as scratch pointer. */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}
687
688PyObject *
689PyUnicode_FromFormatV(const char *format, va_list vargs)
690{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000691 va_list count;
692 Py_ssize_t callcount = 0;
693 PyObject **callresults = NULL;
694 PyObject **callresult = NULL;
695 Py_ssize_t n = 0;
696 int width = 0;
697 int precision = 0;
698 int zeropad;
699 const char* f;
700 Py_UNICODE *s;
701 PyObject *string;
702 /* used by sprintf */
703 char buffer[21];
704 /* use abuffer instead of buffer, if we need more space
705 * (which can happen if there's a format specifier with width). */
706 char *abuffer = NULL;
707 char *realbuffer;
708 Py_ssize_t abuffersize = 0;
709 char fmt[60]; /* should be enough for %0width.precisionld */
710 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714#else
715#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000716 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#endif
720#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 /* step 1: count the number of %S/%R/%A format specifications
722 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
723 * these objects once during step 3 and put the result in
Benjamin Peterson29060642009-01-31 22:14:21 +0000724 an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 for (f = format; *f; f++) {
726 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
727 ++callcount;
728 }
729 /* step 2: allocate memory for the results of
730 * PyObject_Str()/PyObject_Repr() calls */
731 if (callcount) {
732 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
733 if (!callresults) {
734 PyErr_NoMemory();
735 return NULL;
736 }
737 callresult = callresults;
738 }
739 /* step 3: figure out how large a buffer we need */
740 for (f = format; *f; f++) {
741 if (*f == '%') {
742 const char* p = f;
743 width = 0;
744 while (ISDIGIT((unsigned)*f))
745 width = (width*10) + *f++ - '0';
746 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
747 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Benjamin Peterson14339b62009-01-31 16:36:08 +0000749 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
750 * they don't affect the amount of space we reserve.
751 */
752 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000753 (f[1] == 'd' || f[1] == 'u'))
754 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000755
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 switch (*f) {
757 case 'c':
758 (void)va_arg(count, int);
759 /* fall through... */
760 case '%':
761 n++;
762 break;
763 case 'd': case 'u': case 'i': case 'x':
764 (void) va_arg(count, int);
765 /* 20 bytes is enough to hold a 64-bit
766 integer. Decimal takes the most space.
767 This isn't enough for octal.
768 If a width is specified we need more
769 (which we allocate later). */
770 if (width < 20)
771 width = 20;
772 n += width;
773 if (abuffersize < width)
774 abuffersize = width;
775 break;
776 case 's':
777 {
778 /* UTF-8 */
779 unsigned char*s;
780 s = va_arg(count, unsigned char*);
781 while (*s) {
782 if (*s < 128) {
783 n++; s++;
784 } else if (*s < 0xc0) {
785 /* invalid UTF-8 */
786 n++; s++;
787 } else if (*s < 0xc0) {
788 n++;
789 s++; if(!*s)break;
790 s++;
791 } else if (*s < 0xe0) {
792 n++;
793 s++; if(!*s)break;
794 s++; if(!*s)break;
795 s++;
796 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000797#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 n++;
Benjamin Peterson29060642009-01-31 22:14:21 +0000799#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 n+=2;
Benjamin Peterson29060642009-01-31 22:14:21 +0000801#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 s++; if(!*s)break;
803 s++; if(!*s)break;
804 s++; if(!*s)break;
805 s++;
806 }
807 }
808 break;
809 }
810 case 'U':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 assert(obj && PyUnicode_Check(obj));
814 n += PyUnicode_GET_SIZE(obj);
815 break;
816 }
817 case 'V':
818 {
819 PyObject *obj = va_arg(count, PyObject *);
820 const char *str = va_arg(count, const char *);
821 assert(obj || str);
822 assert(!obj || PyUnicode_Check(obj));
823 if (obj)
824 n += PyUnicode_GET_SIZE(obj);
825 else
826 n += strlen(str);
827 break;
828 }
829 case 'S':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 PyObject *str;
833 assert(obj);
834 str = PyObject_Str(obj);
835 if (!str)
836 goto fail;
837 n += PyUnicode_GET_SIZE(str);
838 /* Remember the str and switch to the next slot */
839 *callresult++ = str;
840 break;
841 }
842 case 'R':
843 {
844 PyObject *obj = va_arg(count, PyObject *);
845 PyObject *repr;
846 assert(obj);
847 repr = PyObject_Repr(obj);
848 if (!repr)
849 goto fail;
850 n += PyUnicode_GET_SIZE(repr);
851 /* Remember the repr and switch to the next slot */
852 *callresult++ = repr;
853 break;
854 }
855 case 'A':
856 {
857 PyObject *obj = va_arg(count, PyObject *);
858 PyObject *ascii;
859 assert(obj);
860 ascii = PyObject_ASCII(obj);
861 if (!ascii)
862 goto fail;
863 n += PyUnicode_GET_SIZE(ascii);
864 /* Remember the repr and switch to the next slot */
865 *callresult++ = ascii;
866 break;
867 }
868 case 'p':
869 (void) va_arg(count, int);
870 /* maximum 64-bit pointer representation:
871 * 0xffffffffffffffff
872 * so 19 characters is enough.
873 * XXX I count 18 -- what's the extra for?
874 */
875 n += 19;
876 break;
877 default:
878 /* if we stumble upon an unknown
879 formatting code, copy the rest of
880 the format string to the output
881 string. (we cannot just skip the
882 code, since there's no way to know
883 what's in the argument list) */
884 n += strlen(p);
885 goto expand;
886 }
887 } else
888 n++;
889 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000890 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000891 if (abuffersize > 20) {
892 abuffer = PyObject_Malloc(abuffersize);
893 if (!abuffer) {
894 PyErr_NoMemory();
895 goto fail;
896 }
897 realbuffer = abuffer;
898 }
899 else
900 realbuffer = buffer;
901 /* step 4: fill the buffer */
902 /* Since we've analyzed how much space we need for the worst case,
903 we don't have to resize the string.
904 There can be no errors beyond this point. */
905 string = PyUnicode_FromUnicode(NULL, n);
906 if (!string)
907 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000908
Benjamin Peterson14339b62009-01-31 16:36:08 +0000909 s = PyUnicode_AS_UNICODE(string);
910 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911
Benjamin Peterson14339b62009-01-31 16:36:08 +0000912 for (f = format; *f; f++) {
913 if (*f == '%') {
914 const char* p = f++;
915 int longflag = 0;
916 int size_tflag = 0;
917 zeropad = (*f == '0');
918 /* parse the width.precision part */
919 width = 0;
920 while (ISDIGIT((unsigned)*f))
921 width = (width*10) + *f++ - '0';
922 precision = 0;
923 if (*f == '.') {
924 f++;
925 while (ISDIGIT((unsigned)*f))
926 precision = (precision*10) + *f++ - '0';
927 }
928 /* handle the long flag, but only for %ld and %lu.
929 others can be added when necessary. */
930 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
931 longflag = 1;
932 ++f;
933 }
934 /* handle the size_t flag. */
935 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
936 size_tflag = 1;
937 ++f;
938 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000939
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 switch (*f) {
941 case 'c':
942 *s++ = va_arg(vargs, int);
943 break;
944 case 'd':
945 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
946 if (longflag)
947 sprintf(realbuffer, fmt, va_arg(vargs, long));
948 else if (size_tflag)
949 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
950 else
951 sprintf(realbuffer, fmt, va_arg(vargs, int));
952 appendstring(realbuffer);
953 break;
954 case 'u':
955 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
956 if (longflag)
957 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
958 else if (size_tflag)
959 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
960 else
961 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
962 appendstring(realbuffer);
963 break;
964 case 'i':
965 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
966 sprintf(realbuffer, fmt, va_arg(vargs, int));
967 appendstring(realbuffer);
968 break;
969 case 'x':
970 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
971 sprintf(realbuffer, fmt, va_arg(vargs, int));
972 appendstring(realbuffer);
973 break;
974 case 's':
975 {
976 /* Parameter must be UTF-8 encoded.
977 In case of encoding errors, use
978 the replacement character. */
979 PyObject *u;
980 p = va_arg(vargs, char*);
981 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Peterson29060642009-01-31 22:14:21 +0000982 "replace");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000983 if (!u)
984 goto fail;
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 PyUnicode_GET_SIZE(u));
Benjamin Peterson14339b62009-01-31 16:36:08 +0000987 s += PyUnicode_GET_SIZE(u);
988 Py_DECREF(u);
989 break;
990 }
991 case 'U':
992 {
993 PyObject *obj = va_arg(vargs, PyObject *);
994 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
995 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
996 s += size;
997 break;
998 }
999 case 'V':
1000 {
1001 PyObject *obj = va_arg(vargs, PyObject *);
1002 const char *str = va_arg(vargs, const char *);
1003 if (obj) {
1004 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1005 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1006 s += size;
1007 } else {
1008 appendstring(str);
1009 }
1010 break;
1011 }
1012 case 'S':
1013 case 'R':
1014 {
1015 Py_UNICODE *ucopy;
1016 Py_ssize_t usize;
1017 Py_ssize_t upos;
1018 /* unused, since we already have the result */
1019 (void) va_arg(vargs, PyObject *);
1020 ucopy = PyUnicode_AS_UNICODE(*callresult);
1021 usize = PyUnicode_GET_SIZE(*callresult);
1022 for (upos = 0; upos<usize;)
1023 *s++ = ucopy[upos++];
1024 /* We're done with the unicode()/repr() => forget it */
1025 Py_DECREF(*callresult);
1026 /* switch to next unicode()/repr() result */
1027 ++callresult;
1028 break;
1029 }
1030 case 'p':
1031 sprintf(buffer, "%p", va_arg(vargs, void*));
1032 /* %p is ill-defined: ensure leading 0x. */
1033 if (buffer[1] == 'X')
1034 buffer[1] = 'x';
1035 else if (buffer[1] != 'x') {
1036 memmove(buffer+2, buffer, strlen(buffer)+1);
1037 buffer[0] = '0';
1038 buffer[1] = 'x';
1039 }
1040 appendstring(buffer);
1041 break;
1042 case '%':
1043 *s++ = '%';
1044 break;
1045 default:
1046 appendstring(p);
1047 goto end;
1048 }
1049 } else
1050 *s++ = *f;
1051 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001052
Benjamin Peterson29060642009-01-31 22:14:21 +00001053 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 if (callresults)
1055 PyObject_Free(callresults);
1056 if (abuffer)
1057 PyObject_Free(abuffer);
1058 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1059 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001060 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001061 if (callresults) {
1062 PyObject **callresult2 = callresults;
1063 while (callresult2 < callresult) {
1064 Py_DECREF(*callresult2);
1065 ++callresult2;
1066 }
1067 PyObject_Free(callresults);
1068 }
1069 if (abuffer)
1070 PyObject_Free(abuffer);
1071 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001072}
1073
1074#undef appendstring
1075
1076PyObject *
1077PyUnicode_FromFormat(const char *format, ...)
1078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 PyObject* ret;
1080 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001081
1082#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001083 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001084#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001085 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001086#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 ret = PyUnicode_FromFormatV(format, vargs);
1088 va_end(vargs);
1089 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001090}
1091
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001093 wchar_t *w,
1094 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095{
1096 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001097 PyErr_BadInternalCall();
1098 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001100
1101 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001103 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105#ifdef HAVE_USABLE_WCHAR_T
1106 memcpy(w, unicode->str, size * sizeof(wchar_t));
1107#else
1108 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 register Py_UNICODE *u;
1110 register Py_ssize_t i;
1111 u = PyUnicode_AS_UNICODE(unicode);
1112 for (i = size; i > 0; i--)
1113 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
1115#endif
1116
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001117 if (size > PyUnicode_GET_SIZE(unicode))
1118 return PyUnicode_GET_SIZE(unicode);
1119 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001120 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121}
1122
1123#endif
1124
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001125PyObject *PyUnicode_FromOrdinal(int ordinal)
1126{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001127 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001128
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001129 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001130 PyErr_SetString(PyExc_ValueError,
1131 "chr() arg not in range(0x110000)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001134
1135#ifndef Py_UNICODE_WIDE
1136 if (ordinal > 0xffff) {
1137 ordinal -= 0x10000;
1138 s[0] = 0xD800 | (ordinal >> 10);
1139 s[1] = 0xDC00 | (ordinal & 0x3FF);
1140 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141 }
1142#endif
1143
Hye-Shik Chang40574832004-04-06 07:24:51 +00001144 s[0] = (Py_UNICODE)ordinal;
1145 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001146}
1147
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148PyObject *PyUnicode_FromObject(register PyObject *obj)
1149{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001151 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001152 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 Py_INCREF(obj);
1154 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 }
1156 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 /* For a Unicode subtype that's not a Unicode object,
1158 return a true Unicode object with the same data. */
1159 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1160 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001161 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001162 PyErr_Format(PyExc_TypeError,
1163 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001164 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001165 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166}
1167
1168PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001169 const char *encoding,
1170 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001171{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001172 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001173 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 PyErr_BadInternalCall();
1178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001180
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001182 PyErr_SetString(PyExc_TypeError,
1183 "decoding str is not supported");
1184 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001185 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001186
1187 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001188 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 s = PyBytes_AS_STRING(obj);
1190 len = PyBytes_GET_SIZE(obj);
1191 }
1192 else if (PyByteArray_Check(obj)) {
1193 s = PyByteArray_AS_STRING(obj);
1194 len = PyByteArray_GET_SIZE(obj);
1195 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001196 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001197 /* Overwrite the error message with something more useful in
1198 case of a TypeError. */
1199 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001200 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001201 "coercing to str: need string or buffer, "
1202 "%.80s found",
1203 Py_TYPE(obj)->tp_name);
1204 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001205 }
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001209 Py_INCREF(unicode_empty);
1210 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 }
Tim Petersced69f82003-09-16 20:30:58 +00001212 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001213 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001214
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001215 return v;
1216
Benjamin Peterson29060642009-01-31 22:14:21 +00001217 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219}
1220
1221PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 Py_ssize_t size,
1223 const char *encoding,
1224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225{
1226 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001227 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001228 char lower[20]; /* Enough for any encoding name we recognize */
1229 char *l;
1230 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231
1232 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001233 encoding = PyUnicode_GetDefaultEncoding();
1234
1235 /* Convert encoding to lower case and replace '_' with '-' in order to
1236 catch e.g. UTF_8 */
1237 e = encoding;
1238 l = lower;
1239 while (*e && l < &lower[(sizeof lower) - 2]) {
1240 if (ISUPPER(*e)) {
1241 *l++ = TOLOWER(*e++);
1242 }
1243 else if (*e == '_') {
1244 *l++ = '-';
1245 e++;
1246 }
1247 else {
1248 *l++ = *e++;
1249 }
1250 }
1251 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001252
1253 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001256 else if ((strcmp(lower, "latin-1") == 0) ||
1257 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001258 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001259#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001260 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001261 return PyUnicode_DecodeMBCS(s, size, errors);
1262#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001263 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001264 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001265 else if (strcmp(lower, "utf-16") == 0)
1266 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1267 else if (strcmp(lower, "utf-32") == 0)
1268 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269
1270 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001271 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001272 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001273 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001274 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (buffer == NULL)
1276 goto onError;
1277 unicode = PyCodec_Decode(buffer, encoding, errors);
1278 if (unicode == NULL)
1279 goto onError;
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001282 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001283 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 Py_DECREF(unicode);
1285 goto onError;
1286 }
1287 Py_DECREF(buffer);
1288 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001289
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 Py_XDECREF(buffer);
1292 return NULL;
1293}
1294
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001295PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1296 const char *encoding,
1297 const char *errors)
1298{
1299 PyObject *v;
1300
1301 if (!PyUnicode_Check(unicode)) {
1302 PyErr_BadArgument();
1303 goto onError;
1304 }
1305
1306 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001308
1309 /* Decode via the codec registry */
1310 v = PyCodec_Decode(unicode, encoding, errors);
1311 if (v == NULL)
1312 goto onError;
1313 return v;
1314
Benjamin Peterson29060642009-01-31 22:14:21 +00001315 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001316 return NULL;
1317}
1318
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001319PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001332
1333 /* Decode via the codec registry */
1334 v = PyCodec_Decode(unicode, encoding, errors);
1335 if (v == NULL)
1336 goto onError;
1337 if (!PyUnicode_Check(v)) {
1338 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001339 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001340 Py_TYPE(v)->tp_name);
1341 Py_DECREF(v);
1342 goto onError;
1343 }
1344 return v;
1345
Benjamin Peterson29060642009-01-31 22:14:21 +00001346 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001347 return NULL;
1348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 Py_ssize_t size,
1352 const char *encoding,
1353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354{
1355 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001356
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 unicode = PyUnicode_FromUnicode(s, size);
1358 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1361 Py_DECREF(unicode);
1362 return v;
1363}
1364
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001365PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1366 const char *encoding,
1367 const char *errors)
1368{
1369 PyObject *v;
1370
1371 if (!PyUnicode_Check(unicode)) {
1372 PyErr_BadArgument();
1373 goto onError;
1374 }
1375
1376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001378
1379 /* Encode via the codec registry */
1380 v = PyCodec_Encode(unicode, encoding, errors);
1381 if (v == NULL)
1382 goto onError;
1383 return v;
1384
Benjamin Peterson29060642009-01-31 22:14:21 +00001385 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001386 return NULL;
1387}
1388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1390 const char *encoding,
1391 const char *errors)
1392{
1393 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001394
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 }
Fred Drakee4315f52000-05-09 19:53:39 +00001399
Tim Petersced69f82003-09-16 20:30:58 +00001400 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001401 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001402
1403 /* Shortcuts for common default encodings */
1404 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 if (strcmp(encoding, "utf-8") == 0)
1406 return PyUnicode_AsUTF8String(unicode);
1407 else if (strcmp(encoding, "latin-1") == 0)
1408 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001409#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 else if (strcmp(encoding, "mbcs") == 0)
1411 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001412#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001413 else if (strcmp(encoding, "ascii") == 0)
1414 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001415 /* During bootstrap, we may need to find the encodings
1416 package, to load the file system encoding, and require the
1417 file system encoding in order to load the encodings
1418 package.
1419
1420 Break out of this dependency by assuming that the path to
1421 the encodings module is ASCII-only. XXX could try wcstombs
1422 instead, if the file system encoding is the locale's
1423 encoding. */
1424 else if (Py_FileSystemDefaultEncoding &&
1425 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1426 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001427 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 /* Encode via the codec registry */
1431 v = PyCodec_Encode(unicode, encoding, errors);
1432 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001433 return NULL;
1434
1435 /* The normal path */
1436 if (PyBytes_Check(v))
1437 return v;
1438
1439 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001440 if (PyByteArray_Check(v)) {
1441 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001442 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001443 PyOS_snprintf(msg, sizeof(msg),
1444 "encoder %s returned buffer instead of bytes",
1445 encoding);
1446 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001447 Py_DECREF(v);
1448 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001450
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001451 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1452 Py_DECREF(v);
1453 return b;
1454 }
1455
1456 PyErr_Format(PyExc_TypeError,
1457 "encoder did not return a bytes object (type=%.400s)",
1458 Py_TYPE(v)->tp_name);
1459 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001460 return NULL;
1461}
1462
1463PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1464 const char *encoding,
1465 const char *errors)
1466{
1467 PyObject *v;
1468
1469 if (!PyUnicode_Check(unicode)) {
1470 PyErr_BadArgument();
1471 goto onError;
1472 }
1473
1474 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001476
1477 /* Encode via the codec registry */
1478 v = PyCodec_Encode(unicode, encoding, errors);
1479 if (v == NULL)
1480 goto onError;
1481 if (!PyUnicode_Check(v)) {
1482 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001483 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001484 Py_TYPE(v)->tp_name);
1485 Py_DECREF(v);
1486 goto onError;
1487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return NULL;
1492}
1493
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001494PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001496{
1497 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001498 if (v)
1499 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001500 if (errors != NULL)
1501 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001502 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001505 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001506 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001507 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001508 return v;
1509}
1510
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001511PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001512PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001513 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001514 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1515}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001516
Christian Heimes5894ba72007-11-04 11:43:14 +00001517PyObject*
1518PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1519{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001520 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1521 can be undefined. If it is case, decode using UTF-8. The following assumes
1522 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1523 bootstrapping process where the codecs aren't ready yet.
1524 */
1525 if (Py_FileSystemDefaultEncoding) {
1526#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001527 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001528 return PyUnicode_DecodeMBCS(s, size, "replace");
1529 }
1530#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001531 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001532 return PyUnicode_DecodeUTF8(s, size, "replace");
1533 }
1534#endif
1535 return PyUnicode_Decode(s, size,
1536 Py_FileSystemDefaultEncoding,
1537 "replace");
1538 }
1539 else {
1540 return PyUnicode_DecodeUTF8(s, size, "replace");
1541 }
1542}
1543
Martin v. Löwis5b222132007-06-10 09:51:05 +00001544char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001545_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001546{
Christian Heimesf3863112007-11-22 07:46:41 +00001547 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001548 if (!PyUnicode_Check(unicode)) {
1549 PyErr_BadArgument();
1550 return NULL;
1551 }
Christian Heimesf3863112007-11-22 07:46:41 +00001552 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1553 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001554 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001555 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001556 *psize = PyBytes_GET_SIZE(bytes);
1557 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001558}
1559
1560char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001561_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001562{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001563 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001564}
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1567{
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572 return PyUnicode_AS_UNICODE(unicode);
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 return NULL;
1576}
1577
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579{
1580 if (!PyUnicode_Check(unicode)) {
1581 PyErr_BadArgument();
1582 goto onError;
1583 }
1584 return PyUnicode_GET_SIZE(unicode);
1585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 return -1;
1588}
1589
Thomas Wouters78890102000-07-22 19:25:51 +00001590const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001591{
1592 return unicode_default_encoding;
1593}
1594
1595int PyUnicode_SetDefaultEncoding(const char *encoding)
1596{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001597 if (strcmp(encoding, unicode_default_encoding) != 0) {
1598 PyErr_Format(PyExc_ValueError,
1599 "Can only set default encoding to %s",
1600 unicode_default_encoding);
1601 return -1;
1602 }
Fred Drakee4315f52000-05-09 19:53:39 +00001603 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001604}
1605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606/* error handling callback helper:
1607 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001608 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001609 and adjust various state variables.
1610 return 0 on success, -1 on error
1611*/
1612
1613static
1614int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 const char *encoding, const char *reason,
1616 const char **input, const char **inend, Py_ssize_t *startinpos,
1617 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1618 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001620 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 PyObject *restuple = NULL;
1623 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001624 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001625 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001626 Py_ssize_t requiredsize;
1627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001629 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001630 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 int res = -1;
1632
1633 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001634 *errorHandler = PyCodec_LookupError(errors);
1635 if (*errorHandler == NULL)
1636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 }
1638
1639 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001640 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001641 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1642 if (*exceptionObject == NULL)
1643 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 }
1645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001646 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1647 goto onError;
1648 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1649 goto onError;
1650 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 }
1653
1654 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1655 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001656 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001658 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 }
1661 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001663
1664 /* Copy back the bytes variables, which might have been modified by the
1665 callback */
1666 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1667 if (!inputobj)
1668 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001669 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001671 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001672 *input = PyBytes_AS_STRING(inputobj);
1673 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001674 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001675 /* we can DECREF safely, as the exception has another reference,
1676 so the object won't go away. */
1677 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001680 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001681 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001682 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1683 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001685
1686 /* need more space? (at least enough for what we
1687 have+the replacement+the rest of the string (starting
1688 at the new input position), so we won't have to check space
1689 when there are no errors in the rest of the string) */
1690 repptr = PyUnicode_AS_UNICODE(repunicode);
1691 repsize = PyUnicode_GET_SIZE(repunicode);
1692 requiredsize = *outpos + repsize + insize-newpos;
1693 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001694 if (requiredsize<2*outsize)
1695 requiredsize = 2*outsize;
1696 if (_PyUnicode_Resize(output, requiredsize) < 0)
1697 goto onError;
1698 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 }
1700 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001701 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 Py_UNICODE_COPY(*outptr, repptr, repsize);
1703 *outptr += repsize;
1704 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 /* we made it! */
1707 res = 0;
1708
Benjamin Peterson29060642009-01-31 22:14:21 +00001709 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 Py_XDECREF(restuple);
1711 return res;
1712}
1713
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714/* --- UTF-7 Codec -------------------------------------------------------- */
1715
1716/* see RFC2152 for details */
1717
/* Per-character classification for the UTF-7 codec, indexed by ASCII
   code (0..127).  It indicates whether a character is special, i.e.
   cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1736
/* SPECIAL(c, encodeO, encodeWS): true if character c must not be written
   directly in UTF-7.  Anything outside 1..127 is special; otherwise the
   utf7_special class decides: class 1 always, class 2 (whitespace) only
   when encodeWS is set, class 3 (RFC2152 Set O) only when encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc warnings
   about the comparison always being false; since utf7_special[0] is 1,
   treating 0 as special right there gives the same answer.  The range
   tests come first so the table is never indexed out of bounds. */

#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||            \
     ((encodeO) && (utf7_special[(c)] == 3)))
1746
/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)                                                  \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                              \
      "abcdefghijklmnopqrstuvwxyz"                              \
      "0123456789+/")[(n) & 0x3f])
/* B64CHAR(c): true if c is one of the 64 base64 digits. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): value (0..63) of base64 digit c.  Only meaningful when
   B64CHAR(c) holds; the range tests rely on ASCII ordering
   (digits < upper case < lower case). */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 :                                          \
     (c) == '/' ? 63 :                                          \
     (c) >= 'a' ? (c) - 71 :                                    \
     (c) >= 'A' ? (c) - 65 :                                    \
     (c) + 4)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754
/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   buffer `ch` (of which the low `bits` bits are valid), write the top six
   of them as one base64 digit through `out` and drop them from the count.
   Updates `out` and `bits` in place; expands to a bare while loop. */
#define ENCODE(out, ch, bits)                           \
    while ((bits) >= 6) {                               \
        *(out)++ = B64((ch) >> ((bits) - 6));           \
        (bits) -= 6;                                    \
    }
1760
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   the bit buffer `ch`, extract the top 16 as one Py_UNICODE code unit.
   Updates `out`, `bits` and `surrogate` in place.

   This macro relies on its expansion site: it assigns to a variable named
   `errmsg` and jumps to a label named `utf7Error`, both of which must
   exist in the calling function. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while ((bits) >= 16) {                                                  \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);   \
        (bits) -= 16;                                                       \
        if (surrogate) {                                                    \
            /* An error was already reported for the previous unit, */     \
            /* so this one is dropped without further checks. */           \
            (surrogate) = 0;                                                \
        } else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {                    \
            /* Part of a surrogate pair, which cannot be represented */    \
            /* in a 16-bit character: report an error. */                  \
            (surrogate) = 1;                                                \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *(out)++ = outCh;                                               \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001780PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 Py_ssize_t size,
1782 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001784 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1785}
1786
1787PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 Py_ssize_t size,
1789 const char *errors,
1790 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001793 Py_ssize_t startinpos;
1794 Py_ssize_t endinpos;
1795 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001796 const char *e;
1797 PyUnicodeObject *unicode;
1798 Py_UNICODE *p;
1799 const char *errmsg = "";
1800 int inShift = 0;
1801 unsigned int bitsleft = 0;
1802 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 int surrogate = 0;
1804 PyObject *errorHandler = NULL;
1805 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001806
1807 unicode = _PyUnicode_New(size);
1808 if (!unicode)
1809 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001810 if (size == 0) {
1811 if (consumed)
1812 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001814 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815
1816 p = unicode->str;
1817 e = s + size;
1818
1819 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001822 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823
1824 if (inShift) {
1825 if ((ch == '-') || !B64CHAR(ch)) {
1826 inShift = 0;
1827 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001828
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001829 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1830 if (bitsleft >= 6) {
1831 /* The shift sequence has a partial character in it. If
1832 bitsleft < 6 then we could just classify it as padding
1833 but that is not the case here */
1834
1835 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001836 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837 }
1838 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001839 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840 here so indicate the potential of a misencoded character. */
1841
1842 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1843 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1844 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001845 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 }
1847
1848 if (ch == '-') {
1849 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001850 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 inShift = 1;
1852 }
1853 } else if (SPECIAL(ch,0,0)) {
1854 errmsg = "unexpected special character";
Benjamin Peterson14339b62009-01-31 16:36:08 +00001855 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 } else {
1857 *p++ = ch;
1858 }
1859 } else {
1860 charsleft = (charsleft << 6) | UB64(ch);
1861 bitsleft += 6;
1862 s++;
1863 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1864 }
1865 }
1866 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001868 s++;
1869 if (s < e && *s == '-') {
1870 s++;
1871 *p++ = '+';
1872 } else
1873 {
1874 inShift = 1;
1875 bitsleft = 0;
1876 }
1877 }
1878 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001879 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001880 errmsg = "unexpected special character";
1881 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001882 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883 }
1884 else {
1885 *p++ = ch;
1886 s++;
1887 }
1888 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00001889 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 outpos = p-PyUnicode_AS_UNICODE(unicode);
1891 endinpos = s-starts;
1892 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 errors, &errorHandler,
1894 "utf7", errmsg,
1895 &starts, &e, &startinpos, &endinpos, &exc, &s,
1896 &unicode, &outpos, &p))
1897 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001898 }
1899
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001900 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001901 outpos = p-PyUnicode_AS_UNICODE(unicode);
1902 endinpos = size;
1903 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 errors, &errorHandler,
1905 "utf7", "unterminated shift sequence",
1906 &starts, &e, &startinpos, &endinpos, &exc, &s,
1907 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001908 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 if (s < e)
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001912 if (consumed) {
1913 if(inShift)
1914 *consumed = startinpos;
1915 else
1916 *consumed = s-starts;
1917 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001918
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001919 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920 goto onError;
1921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 Py_XDECREF(errorHandler);
1923 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001924 return (PyObject *)unicode;
1925
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 Py_XDECREF(errorHandler);
1928 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001929 Py_DECREF(unicode);
1930 return NULL;
1931}
1932
1933
1934PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001935 Py_ssize_t size,
1936 int encodeSetO,
1937 int encodeWhiteSpace,
1938 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001940 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001943 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001944 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 unsigned int bitsleft = 0;
1946 unsigned long charsleft = 0;
1947 char * out;
1948 char * start;
1949
1950 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001953 if (cbAllocated / 5 != size)
1954 return PyErr_NoMemory();
1955
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001956 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (v == NULL)
1958 return NULL;
1959
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001960 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001961 for (;i < size; ++i) {
1962 Py_UNICODE ch = s[i];
1963
1964 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001965 if (ch == '+') {
1966 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001967 *out++ = '-';
1968 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1969 charsleft = ch;
1970 bitsleft = 16;
1971 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001972 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001973 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001974 } else {
1975 *out++ = (char) ch;
1976 }
1977 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1979 *out++ = B64(charsleft << (6-bitsleft));
1980 charsleft = 0;
1981 bitsleft = 0;
1982 /* Characters not in the BASE64 set implicitly unshift the sequence
1983 so no '-' is required, except if the character is itself a '-' */
1984 if (B64CHAR(ch) || ch == '-') {
1985 *out++ = '-';
1986 }
1987 inShift = 0;
1988 *out++ = (char) ch;
1989 } else {
1990 bitsleft += 16;
1991 charsleft = (charsleft << 16) | ch;
1992 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1993
Mark Dickinson934896d2009-02-21 20:59:32 +00001994 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001995 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001996 or '-' then the shift sequence will be terminated implicitly and we
1997 don't have to insert a '-'. */
1998
1999 if (bitsleft == 0) {
2000 if (i + 1 < size) {
2001 Py_UNICODE ch2 = s[i+1];
2002
2003 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005 } else if (B64CHAR(ch2) || ch2 == '-') {
2006 *out++ = '-';
2007 inShift = 0;
2008 } else {
2009 inShift = 0;
2010 }
2011
2012 }
2013 else {
2014 *out++ = '-';
2015 inShift = 0;
2016 }
2017 }
Tim Petersced69f82003-09-16 20:30:58 +00002018 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002019 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002020 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 if (bitsleft) {
2022 *out++= B64(charsleft << (6-bitsleft) );
2023 *out++ = '-';
2024 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002025 if (_PyBytes_Resize(&v, out - start) < 0)
2026 return NULL;
2027 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028}
2029
2030#undef SPECIAL
2031#undef B64
2032#undef B64CHAR
2033#undef UB64
2034#undef ENCODE
2035#undef DECODE
2036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037/* --- UTF-8 Codec -------------------------------------------------------- */
2038
/* Map a UTF-8 lead (prefix) byte to the length in bytes of the sequence
   it introduces.  Zero means the byte is an illegal prefix.  See RFC 2279
   for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2060
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 Py_ssize_t size,
2063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064{
Walter Dörwald69652032004-09-07 20:24:22 +00002065 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2066}
2067
Antoine Pitrouab868312009-01-10 15:40:25 +00002068/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2069#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2070
2071/* Mask to quickly check whether a C 'long' contains a
2072 non-ASCII, UTF8-encoded char. */
2073#if (SIZEOF_LONG == 8)
2074# define ASCII_CHAR_MASK 0x8080808080808080L
2075#elif (SIZEOF_LONG == 4)
2076# define ASCII_CHAR_MASK 0x80808080L
2077#else
2078# error C 'long' size should be either 4 or 8!
2079#endif
2080
Walter Dörwald69652032004-09-07 20:24:22 +00002081PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 Py_ssize_t size,
2083 const char *errors,
2084 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t startinpos;
2089 Py_ssize_t endinpos;
2090 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002091 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 PyUnicodeObject *unicode;
2093 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002094 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 PyObject *errorHandler = NULL;
2096 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 /* Note: size will always be longer than the resulting Unicode
2099 character count */
2100 unicode = _PyUnicode_New(size);
2101 if (!unicode)
2102 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002103 if (size == 0) {
2104 if (consumed)
2105 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
2109 /* Unpack UTF-8 encoded data */
2110 p = unicode->str;
2111 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002112 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113
2114 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116
2117 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002118 /* Fast path for runs of ASCII characters. Given that common UTF-8
2119 input will consist of an overwhelming majority of ASCII
2120 characters, we try to optimize for this case by checking
2121 as many characters as a C 'long' can contain.
2122 First, check if we can do an aligned read, as most CPUs have
2123 a penalty for unaligned reads.
2124 */
2125 if (!((size_t) s & LONG_PTR_MASK)) {
2126 /* Help register allocation */
2127 register const char *_s = s;
2128 register Py_UNICODE *_p = p;
2129 while (_s < aligned_end) {
2130 /* Read a whole long at a time (either 4 or 8 bytes),
2131 and do a fast unrolled copy if it only contains ASCII
2132 characters. */
2133 unsigned long data = *(unsigned long *) _s;
2134 if (data & ASCII_CHAR_MASK)
2135 break;
2136 _p[0] = (unsigned char) _s[0];
2137 _p[1] = (unsigned char) _s[1];
2138 _p[2] = (unsigned char) _s[2];
2139 _p[3] = (unsigned char) _s[3];
2140#if (SIZEOF_LONG == 8)
2141 _p[4] = (unsigned char) _s[4];
2142 _p[5] = (unsigned char) _s[5];
2143 _p[6] = (unsigned char) _s[6];
2144 _p[7] = (unsigned char) _s[7];
2145#endif
2146 _s += SIZEOF_LONG;
2147 _p += SIZEOF_LONG;
2148 }
2149 s = _s;
2150 p = _p;
2151 if (s == e)
2152 break;
2153 ch = (unsigned char)*s;
2154 }
2155 }
2156
2157 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002158 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 s++;
2160 continue;
2161 }
2162
2163 n = utf8_code_length[ch];
2164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002165 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 if (consumed)
2167 break;
2168 else {
2169 errmsg = "unexpected end of data";
2170 startinpos = s-starts;
2171 endinpos = size;
2172 goto utf8Error;
2173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175
2176 switch (n) {
2177
2178 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002179 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 startinpos = s-starts;
2181 endinpos = startinpos+1;
2182 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183
2184 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002185 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002186 startinpos = s-starts;
2187 endinpos = startinpos+1;
2188 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
2190 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002191 if ((s[1] & 0xc0) != 0x80) {
2192 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002193 startinpos = s-starts;
2194 endinpos = startinpos+2;
2195 goto utf8Error;
2196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002198 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002199 startinpos = s-starts;
2200 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002201 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002202 goto utf8Error;
2203 }
2204 else
2205 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 break;
2207
2208 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002209 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002210 (s[2] & 0xc0) != 0x80) {
2211 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002212 startinpos = s-starts;
2213 endinpos = startinpos+3;
2214 goto utf8Error;
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002217 if (ch < 0x0800) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 /* Note: UTF-8 encodings of surrogates are considered
2219 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002220
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 XXX For wide builds (UCS-4) we should probably try
2222 to recombine the surrogates into a single code
2223 unit.
2224 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002225 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002226 startinpos = s-starts;
2227 endinpos = startinpos+3;
2228 goto utf8Error;
2229 }
2230 else
2231 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002232 break;
2233
2234 case 4:
2235 if ((s[1] & 0xc0) != 0x80 ||
2236 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002237 (s[3] & 0xc0) != 0x80) {
2238 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002239 startinpos = s-starts;
2240 endinpos = startinpos+4;
2241 goto utf8Error;
2242 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002244 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002245 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002246 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002247 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002248 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 UTF-16 */
2250 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002251 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 startinpos = s-starts;
2253 endinpos = startinpos+4;
2254 goto utf8Error;
2255 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002256#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002257 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002258#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002260
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 /* translate from 10000..10FFFF to 0..FFFF */
2262 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002263
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002264 /* high surrogate = top 10 bits added to D800 */
2265 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002266
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002267 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002268 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002269#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 break;
2271
2272 default:
2273 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002274 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 startinpos = s-starts;
2276 endinpos = startinpos+n;
2277 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 }
2279 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002280 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002281
Benjamin Peterson29060642009-01-31 22:14:21 +00002282 utf8Error:
2283 outpos = p-PyUnicode_AS_UNICODE(unicode);
2284 if (unicode_decode_call_errorhandler(
2285 errors, &errorHandler,
2286 "utf8", errmsg,
2287 &starts, &e, &startinpos, &endinpos, &exc, &s,
2288 &unicode, &outpos, &p))
2289 goto onError;
2290 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
Walter Dörwald69652032004-09-07 20:24:22 +00002292 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002293 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294
2295 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002296 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 goto onError;
2298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 Py_XDECREF(errorHandler);
2300 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 return (PyObject *)unicode;
2302
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002304 Py_XDECREF(errorHandler);
2305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 Py_DECREF(unicode);
2307 return NULL;
2308}
2309
Antoine Pitrouab868312009-01-10 15:40:25 +00002310#undef ASCII_CHAR_MASK
2311
2312
Tim Peters602f7402002-04-27 18:03:26 +00002313/* Allocation strategy: if the string is short, convert into a stack buffer
2314 and allocate exactly as much space needed at the end. Else allocate the
2315 maximum possible needed (4 result bytes per Unicode character), and return
2316 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002317*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002318PyObject *
2319PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002320 Py_ssize_t size,
2321 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322{
Tim Peters602f7402002-04-27 18:03:26 +00002323#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002324
Guido van Rossum98297ee2007-11-06 21:34:58 +00002325 Py_ssize_t i; /* index into s of next input byte */
2326 PyObject *result; /* result string object */
2327 char *p; /* next free byte in output buffer */
2328 Py_ssize_t nallocated; /* number of result bytes allocated */
2329 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002330 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002331
Tim Peters602f7402002-04-27 18:03:26 +00002332 assert(s != NULL);
2333 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Tim Peters602f7402002-04-27 18:03:26 +00002335 if (size <= MAX_SHORT_UNICHARS) {
2336 /* Write into the stack buffer; nallocated can't overflow.
2337 * At the end, we'll allocate exactly as much heap space as it
2338 * turns out we need.
2339 */
2340 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002341 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002342 p = stackbuf;
2343 }
2344 else {
2345 /* Overallocate on the heap, and give the excess back at the end. */
2346 nallocated = size * 4;
2347 if (nallocated / 4 != size) /* overflow! */
2348 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002349 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002350 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002351 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002352 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002353 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002354
Tim Peters602f7402002-04-27 18:03:26 +00002355 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002356 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002357
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002358 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002359 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002361
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002363 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002364 *p++ = (char)(0xc0 | (ch >> 6));
2365 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002366 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002367 else {
Tim Peters602f7402002-04-27 18:03:26 +00002368 /* Encode UCS2 Unicode ordinals */
2369 if (ch < 0x10000) {
2370 /* Special case: check for high surrogate */
2371 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2372 Py_UCS4 ch2 = s[i];
2373 /* Check for low surrogate and combine the two to
2374 form a UCS4 value */
2375 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002376 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002377 i++;
2378 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002379 }
Tim Peters602f7402002-04-27 18:03:26 +00002380 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002381 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002382 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002383 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2384 *p++ = (char)(0x80 | (ch & 0x3f));
2385 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002388 /* Encode UCS4 Unicode ordinals */
2389 *p++ = (char)(0xf0 | (ch >> 18));
2390 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2391 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2392 *p++ = (char)(0x80 | (ch & 0x3f));
2393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002395
Guido van Rossum98297ee2007-11-06 21:34:58 +00002396 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002397 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002398 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002399 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002400 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002401 }
2402 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002403 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002404 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002405 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002406 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002408 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002409
Tim Peters602f7402002-04-27 18:03:26 +00002410#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411}
2412
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2414{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 if (!PyUnicode_Check(unicode)) {
2416 PyErr_BadArgument();
2417 return NULL;
2418 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002419 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 PyUnicode_GET_SIZE(unicode),
2421 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422}
2423
Walter Dörwald41980ca2007-08-16 21:55:45 +00002424/* --- UTF-32 Codec ------------------------------------------------------- */
2425
2426PyObject *
2427PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002428 Py_ssize_t size,
2429 const char *errors,
2430 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002431{
2432 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2433}
2434
2435PyObject *
2436PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 Py_ssize_t size,
2438 const char *errors,
2439 int *byteorder,
2440 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002441{
2442 const char *starts = s;
2443 Py_ssize_t startinpos;
2444 Py_ssize_t endinpos;
2445 Py_ssize_t outpos;
2446 PyUnicodeObject *unicode;
2447 Py_UNICODE *p;
2448#ifndef Py_UNICODE_WIDE
2449 int i, pairs;
2450#else
2451 const int pairs = 0;
2452#endif
2453 const unsigned char *q, *e;
2454 int bo = 0; /* assume native ordering by default */
2455 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002456 /* Offsets from q for retrieving bytes in the right order. */
2457#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2458 int iorder[] = {0, 1, 2, 3};
2459#else
2460 int iorder[] = {3, 2, 1, 0};
2461#endif
2462 PyObject *errorHandler = NULL;
2463 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002464 /* On narrow builds we split characters outside the BMP into two
2465 codepoints => count how much extra space we need. */
2466#ifndef Py_UNICODE_WIDE
2467 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002468 if (((Py_UCS4 *)s)[i] >= 0x10000)
2469 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002470#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002471
2472 /* This might be one to much, because of a BOM */
2473 unicode = _PyUnicode_New((size+3)/4+pairs);
2474 if (!unicode)
2475 return NULL;
2476 if (size == 0)
2477 return (PyObject *)unicode;
2478
2479 /* Unpack UTF-32 encoded data */
2480 p = unicode->str;
2481 q = (unsigned char *)s;
2482 e = q + size;
2483
2484 if (byteorder)
2485 bo = *byteorder;
2486
2487 /* Check for BOM marks (U+FEFF) in the input and adjust current
2488 byte order setting accordingly. In native mode, the leading BOM
2489 mark is skipped, in all other modes, it is copied to the output
2490 stream as-is (giving a ZWNBSP character). */
2491 if (bo == 0) {
2492 if (size >= 4) {
2493 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002495#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 if (bom == 0x0000FEFF) {
2497 q += 4;
2498 bo = -1;
2499 }
2500 else if (bom == 0xFFFE0000) {
2501 q += 4;
2502 bo = 1;
2503 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002504#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 if (bom == 0x0000FEFF) {
2506 q += 4;
2507 bo = 1;
2508 }
2509 else if (bom == 0xFFFE0000) {
2510 q += 4;
2511 bo = -1;
2512 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002513#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002515 }
2516
2517 if (bo == -1) {
2518 /* force LE */
2519 iorder[0] = 0;
2520 iorder[1] = 1;
2521 iorder[2] = 2;
2522 iorder[3] = 3;
2523 }
2524 else if (bo == 1) {
2525 /* force BE */
2526 iorder[0] = 3;
2527 iorder[1] = 2;
2528 iorder[2] = 1;
2529 iorder[3] = 0;
2530 }
2531
2532 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_UCS4 ch;
2534 /* remaining bytes at the end? (size should be divisible by 4) */
2535 if (e-q<4) {
2536 if (consumed)
2537 break;
2538 errmsg = "truncated data";
2539 startinpos = ((const char *)q)-starts;
2540 endinpos = ((const char *)e)-starts;
2541 goto utf32Error;
2542 /* The remaining input chars are ignored if the callback
2543 chooses to skip the input */
2544 }
2545 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2546 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002547
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 if (ch >= 0x110000)
2549 {
2550 errmsg = "codepoint not in range(0x110000)";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = startinpos+4;
2553 goto utf32Error;
2554 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002555#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 if (ch >= 0x10000)
2557 {
2558 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2559 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2560 }
2561 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002563 *p++ = ch;
2564 q += 4;
2565 continue;
2566 utf32Error:
2567 outpos = p-PyUnicode_AS_UNICODE(unicode);
2568 if (unicode_decode_call_errorhandler(
2569 errors, &errorHandler,
2570 "utf32", errmsg,
2571 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2572 &unicode, &outpos, &p))
2573 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002574 }
2575
2576 if (byteorder)
2577 *byteorder = bo;
2578
2579 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002581
2582 /* Adjust length */
2583 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2584 goto onError;
2585
2586 Py_XDECREF(errorHandler);
2587 Py_XDECREF(exc);
2588 return (PyObject *)unicode;
2589
Benjamin Peterson29060642009-01-31 22:14:21 +00002590 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002591 Py_DECREF(unicode);
2592 Py_XDECREF(errorHandler);
2593 Py_XDECREF(exc);
2594 return NULL;
2595}
2596
2597PyObject *
2598PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 Py_ssize_t size,
2600 const char *errors,
2601 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002602{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002603 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002604 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002605 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002606#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002607 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002608#else
2609 const int pairs = 0;
2610#endif
2611 /* Offsets from p for storing byte pairs in the right order. */
2612#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2613 int iorder[] = {0, 1, 2, 3};
2614#else
2615 int iorder[] = {3, 2, 1, 0};
2616#endif
2617
Benjamin Peterson29060642009-01-31 22:14:21 +00002618#define STORECHAR(CH) \
2619 do { \
2620 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2621 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2622 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2623 p[iorder[0]] = (CH) & 0xff; \
2624 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002625 } while(0)
2626
2627 /* In narrow builds we can output surrogate pairs as one codepoint,
2628 so we need less space. */
2629#ifndef Py_UNICODE_WIDE
2630 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002631 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2632 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2633 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002634#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002635 nsize = (size - pairs + (byteorder == 0));
2636 bytesize = nsize * 4;
2637 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002639 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640 if (v == NULL)
2641 return NULL;
2642
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002643 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002644 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002646 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002647 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002648
2649 if (byteorder == -1) {
2650 /* force LE */
2651 iorder[0] = 0;
2652 iorder[1] = 1;
2653 iorder[2] = 2;
2654 iorder[3] = 3;
2655 }
2656 else if (byteorder == 1) {
2657 /* force BE */
2658 iorder[0] = 3;
2659 iorder[1] = 2;
2660 iorder[2] = 1;
2661 iorder[3] = 0;
2662 }
2663
2664 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002666#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002667 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2668 Py_UCS4 ch2 = *s;
2669 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2670 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2671 s++;
2672 size--;
2673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002675#endif
2676 STORECHAR(ch);
2677 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002678
2679 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002680 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002681#undef STORECHAR
2682}
2683
2684PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2685{
2686 if (!PyUnicode_Check(unicode)) {
2687 PyErr_BadArgument();
2688 return NULL;
2689 }
2690 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 PyUnicode_GET_SIZE(unicode),
2692 NULL,
2693 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002694}
2695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696/* --- UTF-16 Codec ------------------------------------------------------- */
2697
Tim Peters772747b2001-08-09 22:21:55 +00002698PyObject *
2699PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 Py_ssize_t size,
2701 const char *errors,
2702 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703{
Walter Dörwald69652032004-09-07 20:24:22 +00002704 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2705}
2706
Antoine Pitrouab868312009-01-10 15:40:25 +00002707/* Two masks for fast checking of whether a C 'long' may contain
2708 UTF16-encoded surrogate characters. This is an efficient heuristic,
2709 assuming that non-surrogate characters with a code point >= 0x8000 are
2710 rare in most input.
2711 FAST_CHAR_MASK is used when the input is in native byte ordering,
2712 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002713*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002714#if (SIZEOF_LONG == 8)
2715# define FAST_CHAR_MASK 0x8000800080008000L
2716# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2717#elif (SIZEOF_LONG == 4)
2718# define FAST_CHAR_MASK 0x80008000L
2719# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2720#else
2721# error C 'long' size should be either 4 or 8!
2722#endif
2723
Walter Dörwald69652032004-09-07 20:24:22 +00002724PyObject *
2725PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002726 Py_ssize_t size,
2727 const char *errors,
2728 int *byteorder,
2729 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t startinpos;
2733 Py_ssize_t endinpos;
2734 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyUnicodeObject *unicode;
2736 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002737 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002738 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002739 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002740 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002741 /* Offsets from q for retrieving byte pairs in the right order. */
2742#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2743 int ihi = 1, ilo = 0;
2744#else
2745 int ihi = 0, ilo = 1;
2746#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749
2750 /* Note: size will always be longer than the resulting Unicode
2751 character count */
2752 unicode = _PyUnicode_New(size);
2753 if (!unicode)
2754 return NULL;
2755 if (size == 0)
2756 return (PyObject *)unicode;
2757
2758 /* Unpack UTF-16 encoded data */
2759 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002760 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002761 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762
2763 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002764 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002766 /* Check for BOM marks (U+FEFF) in the input and adjust current
2767 byte order setting accordingly. In native mode, the leading BOM
2768 mark is skipped, in all other modes, it is copied to the output
2769 stream as-is (giving a ZWNBSP character). */
2770 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002771 if (size >= 2) {
2772 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002773#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002774 if (bom == 0xFEFF) {
2775 q += 2;
2776 bo = -1;
2777 }
2778 else if (bom == 0xFFFE) {
2779 q += 2;
2780 bo = 1;
2781 }
Tim Petersced69f82003-09-16 20:30:58 +00002782#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 if (bom == 0xFEFF) {
2784 q += 2;
2785 bo = 1;
2786 }
2787 else if (bom == 0xFFFE) {
2788 q += 2;
2789 bo = -1;
2790 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002791#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794
Tim Peters772747b2001-08-09 22:21:55 +00002795 if (bo == -1) {
2796 /* force LE */
2797 ihi = 1;
2798 ilo = 0;
2799 }
2800 else if (bo == 1) {
2801 /* force BE */
2802 ihi = 0;
2803 ilo = 1;
2804 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002805#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2806 native_ordering = ilo < ihi;
2807#else
2808 native_ordering = ilo > ihi;
2809#endif
Tim Peters772747b2001-08-09 22:21:55 +00002810
Antoine Pitrouab868312009-01-10 15:40:25 +00002811 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002812 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002813 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002814 /* First check for possible aligned read of a C 'long'. Unaligned
2815 reads are more expensive, better to defer to another iteration. */
2816 if (!((size_t) q & LONG_PTR_MASK)) {
2817 /* Fast path for runs of non-surrogate chars. */
2818 register const unsigned char *_q = q;
2819 Py_UNICODE *_p = p;
2820 if (native_ordering) {
2821 /* Native ordering is simple: as long as the input cannot
2822 possibly contain a surrogate char, do an unrolled copy
2823 of several 16-bit code points to the target object.
2824 The non-surrogate check is done on several input bytes
2825 at a time (as many as a C 'long' can contain). */
2826 while (_q < aligned_end) {
2827 unsigned long data = * (unsigned long *) _q;
2828 if (data & FAST_CHAR_MASK)
2829 break;
2830 _p[0] = ((unsigned short *) _q)[0];
2831 _p[1] = ((unsigned short *) _q)[1];
2832#if (SIZEOF_LONG == 8)
2833 _p[2] = ((unsigned short *) _q)[2];
2834 _p[3] = ((unsigned short *) _q)[3];
2835#endif
2836 _q += SIZEOF_LONG;
2837 _p += SIZEOF_LONG / 2;
2838 }
2839 }
2840 else {
2841 /* Byteswapped ordering is similar, but we must decompose
2842 the copy bytewise, and take care of zero'ing out the
2843 upper bytes if the target object is in 32-bit units
2844 (that is, in UCS-4 builds). */
2845 while (_q < aligned_end) {
2846 unsigned long data = * (unsigned long *) _q;
2847 if (data & SWAPPED_FAST_CHAR_MASK)
2848 break;
2849 /* Zero upper bytes in UCS-4 builds */
2850#if (Py_UNICODE_SIZE > 2)
2851 _p[0] = 0;
2852 _p[1] = 0;
2853#if (SIZEOF_LONG == 8)
2854 _p[2] = 0;
2855 _p[3] = 0;
2856#endif
2857#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002858 /* Issue #4916; UCS-4 builds on big endian machines must
2859 fill the two last bytes of each 4-byte unit. */
2860#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2861# define OFF 2
2862#else
2863# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002864#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002865 ((unsigned char *) _p)[OFF + 1] = _q[0];
2866 ((unsigned char *) _p)[OFF + 0] = _q[1];
2867 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2868 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2869#if (SIZEOF_LONG == 8)
2870 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2871 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2872 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2873 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2874#endif
2875#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002876 _q += SIZEOF_LONG;
2877 _p += SIZEOF_LONG / 2;
2878 }
2879 }
2880 p = _p;
2881 q = _q;
2882 if (q >= e)
2883 break;
2884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002886
Benjamin Peterson14339b62009-01-31 16:36:08 +00002887 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002888
2889 if (ch < 0xD800 || ch > 0xDFFF) {
2890 *p++ = ch;
2891 continue;
2892 }
2893
2894 /* UTF-16 code pair: */
2895 if (q > e) {
2896 errmsg = "unexpected end of data";
2897 startinpos = (((const char *)q) - 2) - starts;
2898 endinpos = ((const char *)e) + 1 - starts;
2899 goto utf16Error;
2900 }
2901 if (0xD800 <= ch && ch <= 0xDBFF) {
2902 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2903 q += 2;
2904 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002905#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 *p++ = ch;
2907 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002908#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002910#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 continue;
2912 }
2913 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002914 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 startinpos = (((const char *)q)-4)-starts;
2916 endinpos = startinpos+2;
2917 goto utf16Error;
2918 }
2919
Benjamin Peterson14339b62009-01-31 16:36:08 +00002920 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 errmsg = "illegal encoding";
2922 startinpos = (((const char *)q)-2)-starts;
2923 endinpos = startinpos+2;
2924 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002925
Benjamin Peterson29060642009-01-31 22:14:21 +00002926 utf16Error:
2927 outpos = p - PyUnicode_AS_UNICODE(unicode);
2928 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002929 errors,
2930 &errorHandler,
2931 "utf16", errmsg,
2932 &starts,
2933 (const char **)&e,
2934 &startinpos,
2935 &endinpos,
2936 &exc,
2937 (const char **)&q,
2938 &unicode,
2939 &outpos,
2940 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002943 /* remaining byte at the end? (size should be even) */
2944 if (e == q) {
2945 if (!consumed) {
2946 errmsg = "truncated data";
2947 startinpos = ((const char *)q) - starts;
2948 endinpos = ((const char *)e) + 1 - starts;
2949 outpos = p - PyUnicode_AS_UNICODE(unicode);
2950 if (unicode_decode_call_errorhandler(
2951 errors,
2952 &errorHandler,
2953 "utf16", errmsg,
2954 &starts,
2955 (const char **)&e,
2956 &startinpos,
2957 &endinpos,
2958 &exc,
2959 (const char **)&q,
2960 &unicode,
2961 &outpos,
2962 &p))
2963 goto onError;
2964 /* The remaining input chars are ignored if the callback
2965 chooses to skip the input */
2966 }
2967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 if (byteorder)
2970 *byteorder = bo;
2971
Walter Dörwald69652032004-09-07 20:24:22 +00002972 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002974
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002976 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 goto onError;
2978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 Py_XDECREF(errorHandler);
2980 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 return (PyObject *)unicode;
2982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 Py_XDECREF(errorHandler);
2986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 return NULL;
2988}
2989
Antoine Pitrouab868312009-01-10 15:40:25 +00002990#undef FAST_CHAR_MASK
2991#undef SWAPPED_FAST_CHAR_MASK
2992
Tim Peters772747b2001-08-09 22:21:55 +00002993PyObject *
2994PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 Py_ssize_t size,
2996 const char *errors,
2997 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002999 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003000 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003001 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003002#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003003 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003004#else
3005 const int pairs = 0;
3006#endif
Tim Peters772747b2001-08-09 22:21:55 +00003007 /* Offsets from p for storing byte pairs in the right order. */
3008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3009 int ihi = 1, ilo = 0;
3010#else
3011 int ihi = 0, ilo = 1;
3012#endif
3013
Benjamin Peterson29060642009-01-31 22:14:21 +00003014#define STORECHAR(CH) \
3015 do { \
3016 p[ihi] = ((CH) >> 8) & 0xff; \
3017 p[ilo] = (CH) & 0xff; \
3018 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003019 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003021#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003022 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 if (s[i] >= 0x10000)
3024 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003025#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003026 /* 2 * (size + pairs + (byteorder == 0)) */
3027 if (size > PY_SSIZE_T_MAX ||
3028 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003030 nsize = size + pairs + (byteorder == 0);
3031 bytesize = nsize * 2;
3032 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003034 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (v == NULL)
3036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003038 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003041 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003042 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003043
3044 if (byteorder == -1) {
3045 /* force LE */
3046 ihi = 1;
3047 ilo = 0;
3048 }
3049 else if (byteorder == 1) {
3050 /* force BE */
3051 ihi = 0;
3052 ilo = 1;
3053 }
3054
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003055 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 Py_UNICODE ch = *s++;
3057 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003058#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 if (ch >= 0x10000) {
3060 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3061 ch = 0xD800 | ((ch-0x10000) >> 10);
3062 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003063#endif
Tim Peters772747b2001-08-09 22:21:55 +00003064 STORECHAR(ch);
3065 if (ch2)
3066 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003067 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003068
3069 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003070 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003071#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072}
3073
3074PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3075{
3076 if (!PyUnicode_Check(unicode)) {
3077 PyErr_BadArgument();
3078 return NULL;
3079 }
3080 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 PyUnicode_GET_SIZE(unicode),
3082 NULL,
3083 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084}
3085
3086/* --- Unicode Escape Codec ----------------------------------------------- */
3087
Fredrik Lundh06d12682001-01-24 07:59:11 +00003088static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 Py_ssize_t size,
3092 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003095 Py_ssize_t startinpos;
3096 Py_ssize_t endinpos;
3097 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003102 char* message;
3103 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 PyObject *errorHandler = NULL;
3105 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 /* Escaped strings will always be longer than the resulting
3108 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 length after conversion to the true value.
3110 (but if the error callback returns a long replacement string
3111 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 v = _PyUnicode_New(size);
3113 if (v == NULL)
3114 goto onError;
3115 if (size == 0)
3116 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 while (s < end) {
3122 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003123 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003124 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 /* Non-escape characters are interpreted as Unicode ordinals */
3127 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003128 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 continue;
3130 }
3131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 /* \ - Escapes */
3134 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003135 c = *s++;
3136 if (s > end)
3137 c = '\0'; /* Invalid after \ */
3138 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139
Benjamin Peterson29060642009-01-31 22:14:21 +00003140 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 case '\n': break;
3142 case '\\': *p++ = '\\'; break;
3143 case '\'': *p++ = '\''; break;
3144 case '\"': *p++ = '\"'; break;
3145 case 'b': *p++ = '\b'; break;
3146 case 'f': *p++ = '\014'; break; /* FF */
3147 case 't': *p++ = '\t'; break;
3148 case 'n': *p++ = '\n'; break;
3149 case 'r': *p++ = '\r'; break;
3150 case 'v': *p++ = '\013'; break; /* VT */
3151 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 case '0': case '1': case '2': case '3':
3155 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003156 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003157 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003158 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003159 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003162 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 break;
3164
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 /* hex escapes */
3166 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003168 digits = 2;
3169 message = "truncated \\xXX escape";
3170 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003174 digits = 4;
3175 message = "truncated \\uXXXX escape";
3176 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003179 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003180 digits = 8;
3181 message = "truncated \\UXXXXXXXX escape";
3182 hexescape:
3183 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 outpos = p-PyUnicode_AS_UNICODE(v);
3185 if (s+digits>end) {
3186 endinpos = size;
3187 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 errors, &errorHandler,
3189 "unicodeescape", "end of string in escape sequence",
3190 &starts, &end, &startinpos, &endinpos, &exc, &s,
3191 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 goto onError;
3193 goto nextByte;
3194 }
3195 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003196 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003197 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198 endinpos = (s+i+1)-starts;
3199 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 errors, &errorHandler,
3201 "unicodeescape", message,
3202 &starts, &end, &startinpos, &endinpos, &exc, &s,
3203 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003204 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003206 }
3207 chr = (chr<<4) & ~0xF;
3208 if (c >= '0' && c <= '9')
3209 chr += c - '0';
3210 else if (c >= 'a' && c <= 'f')
3211 chr += 10 + c - 'a';
3212 else
3213 chr += 10 + c - 'A';
3214 }
3215 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003216 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* _decoding_error will have already written into the
3218 target buffer. */
3219 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003220 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003221 /* when we get here, chr is a 32-bit unicode character */
3222 if (chr <= 0xffff)
3223 /* UCS-2 character */
3224 *p++ = (Py_UNICODE) chr;
3225 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003226 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003227 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003228#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003229 *p++ = chr;
3230#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003231 chr -= 0x10000L;
3232 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003233 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003234#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003235 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 endinpos = s-starts;
3237 outpos = p-PyUnicode_AS_UNICODE(v);
3238 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 errors, &errorHandler,
3240 "unicodeescape", "illegal Unicode character",
3241 &starts, &end, &startinpos, &endinpos, &exc, &s,
3242 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003243 goto onError;
3244 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003245 break;
3246
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003248 case 'N':
3249 message = "malformed \\N character escape";
3250 if (ucnhash_CAPI == NULL) {
3251 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003252 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003253 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003254 if (m == NULL)
3255 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003256 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003257 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003258 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003259 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003260 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003261 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003262 if (ucnhash_CAPI == NULL)
3263 goto ucnhashError;
3264 }
3265 if (*s == '{') {
3266 const char *start = s+1;
3267 /* look for the closing brace */
3268 while (*s != '}' && s < end)
3269 s++;
3270 if (s > start && s < end && *s == '}') {
3271 /* found a name. look it up in the unicode database */
3272 message = "unknown Unicode character name";
3273 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003274 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003275 goto store;
3276 }
3277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 endinpos = s-starts;
3279 outpos = p-PyUnicode_AS_UNICODE(v);
3280 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 errors, &errorHandler,
3282 "unicodeescape", message,
3283 &starts, &end, &startinpos, &endinpos, &exc, &s,
3284 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003285 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003286 break;
3287
3288 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003289 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 message = "\\ at end of string";
3291 s--;
3292 endinpos = s-starts;
3293 outpos = p-PyUnicode_AS_UNICODE(v);
3294 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 errors, &errorHandler,
3296 "unicodeescape", message,
3297 &starts, &end, &startinpos, &endinpos, &exc, &s,
3298 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003299 goto onError;
3300 }
3301 else {
3302 *p++ = '\\';
3303 *p++ = (unsigned char)s[-1];
3304 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003305 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003317 PyErr_SetString(
3318 PyExc_UnicodeError,
3319 "\\N escapes not supported (can't load unicodedata module)"
3320 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003321 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 Py_XDECREF(errorHandler);
3323 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003324 return NULL;
3325
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 Py_XDECREF(errorHandler);
3329 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 return NULL;
3331}
3332
/* Return a Unicode-Escape string version of the Unicode object.

   Note: PyUnicode_EncodeUnicodeEscape() below takes no `quotes` argument
   and does not enclose its output in u"" or u'' quotes.

*/
3339
Thomas Wouters477c8d52006-05-27 19:21:47 +00003340Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 Py_ssize_t size,
3342 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003343{
3344 /* like wcschr, but doesn't stop at NULL characters */
3345
3346 while (size-- > 0) {
3347 if (*s == ch)
3348 return s;
3349 s++;
3350 }
3351
3352 return NULL;
3353}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003354
Walter Dörwald79e913e2007-05-12 11:08:06 +00003355static const char *hexdigits = "0123456789abcdef";
3356
3357PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003360 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003363#ifdef Py_UNICODE_WIDE
3364 const Py_ssize_t expandsize = 10;
3365#else
3366 const Py_ssize_t expandsize = 6;
3367#endif
3368
Thomas Wouters89f507f2006-12-13 04:49:30 +00003369 /* XXX(nnorwitz): rather than over-allocating, it would be
3370 better to choose a different scheme. Perhaps scan the
3371 first N-chars of the string and allocate based on that size.
3372 */
3373 /* Initial allocation is based on the longest-possible unichr
3374 escape.
3375
3376 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3377 unichr, so in this case it's the longest unichr escape. In
3378 narrow (UTF-16) builds this is five chars per source unichr
3379 since there are two unichrs in the surrogate pair, so in narrow
3380 (UTF-16) builds it's not the longest unichr escape.
3381
3382 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3383 so in the narrow (UTF-16) build case it's the longest unichr
3384 escape.
3385 */
3386
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003387 if (size == 0)
3388 return PyBytes_FromStringAndSize(NULL, 0);
3389
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003390 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003392
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003393 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 2
3395 + expandsize*size
3396 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (repr == NULL)
3398 return NULL;
3399
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003400 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 while (size-- > 0) {
3403 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003404
Walter Dörwald79e913e2007-05-12 11:08:06 +00003405 /* Escape backslashes */
3406 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 *p++ = '\\';
3408 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003409 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003410 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003411
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003412#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003413 /* Map 21-bit characters to '\U00xxxxxx' */
3414 else if (ch >= 0x10000) {
3415 *p++ = '\\';
3416 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003417 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3418 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3419 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3420 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3421 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3422 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3423 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3424 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003426 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003427#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3429 else if (ch >= 0xD800 && ch < 0xDC00) {
3430 Py_UNICODE ch2;
3431 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003432
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 ch2 = *s++;
3434 size--;
3435 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3436 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3437 *p++ = '\\';
3438 *p++ = 'U';
3439 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3440 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3441 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3442 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3443 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3444 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3445 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3446 *p++ = hexdigits[ucs & 0x0000000F];
3447 continue;
3448 }
3449 /* Fall through: isolated surrogates are copied as-is */
3450 s--;
3451 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003452 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003453#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003454
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003456 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 *p++ = '\\';
3458 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003459 *p++ = hexdigits[(ch >> 12) & 0x000F];
3460 *p++ = hexdigits[(ch >> 8) & 0x000F];
3461 *p++ = hexdigits[(ch >> 4) & 0x000F];
3462 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003464
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003465 /* Map special whitespace to '\t', \n', '\r' */
3466 else if (ch == '\t') {
3467 *p++ = '\\';
3468 *p++ = 't';
3469 }
3470 else if (ch == '\n') {
3471 *p++ = '\\';
3472 *p++ = 'n';
3473 }
3474 else if (ch == '\r') {
3475 *p++ = '\\';
3476 *p++ = 'r';
3477 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003478
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003479 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003480 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003482 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003483 *p++ = hexdigits[(ch >> 4) & 0x000F];
3484 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003485 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003486
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 /* Copy everything else as-is */
3488 else
3489 *p++ = (char) ch;
3490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003492 assert(p - PyBytes_AS_STRING(repr) > 0);
3493 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3494 return NULL;
3495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496}
3497
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003498PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003500 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 if (!PyUnicode_Check(unicode)) {
3502 PyErr_BadArgument();
3503 return NULL;
3504 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003505 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3506 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003507 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508}
3509
3510/* --- Raw Unicode Escape Codec ------------------------------------------- */
3511
3512PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 Py_ssize_t size,
3514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003517 Py_ssize_t startinpos;
3518 Py_ssize_t endinpos;
3519 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 const char *end;
3523 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 PyObject *errorHandler = NULL;
3525 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 /* Escaped strings will always be longer than the resulting
3528 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 length after conversion to the true value. (But decoding error
3530 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 v = _PyUnicode_New(size);
3532 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 end = s + size;
3538 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 unsigned char c;
3540 Py_UCS4 x;
3541 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003542 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 /* Non-escape characters are interpreted as Unicode ordinals */
3545 if (*s != '\\') {
3546 *p++ = (unsigned char)*s++;
3547 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 startinpos = s-starts;
3550
3551 /* \u-escapes are only interpreted iff the number of leading
3552 backslashes if odd */
3553 bs = s;
3554 for (;s < end;) {
3555 if (*s != '\\')
3556 break;
3557 *p++ = (unsigned char)*s++;
3558 }
3559 if (((s - bs) & 1) == 0 ||
3560 s >= end ||
3561 (*s != 'u' && *s != 'U')) {
3562 continue;
3563 }
3564 p--;
3565 count = *s=='u' ? 4 : 8;
3566 s++;
3567
3568 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3569 outpos = p-PyUnicode_AS_UNICODE(v);
3570 for (x = 0, i = 0; i < count; ++i, ++s) {
3571 c = (unsigned char)*s;
3572 if (!ISXDIGIT(c)) {
3573 endinpos = s-starts;
3574 if (unicode_decode_call_errorhandler(
3575 errors, &errorHandler,
3576 "rawunicodeescape", "truncated \\uXXXX",
3577 &starts, &end, &startinpos, &endinpos, &exc, &s,
3578 &v, &outpos, &p))
3579 goto onError;
3580 goto nextByte;
3581 }
3582 x = (x<<4) & ~0xF;
3583 if (c >= '0' && c <= '9')
3584 x += c - '0';
3585 else if (c >= 'a' && c <= 'f')
3586 x += 10 + c - 'a';
3587 else
3588 x += 10 + c - 'A';
3589 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003590 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 /* UCS-2 character */
3592 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003593 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003594 /* UCS-4 character. Either store directly, or as
3595 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003596#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003598#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 x -= 0x10000L;
3600 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3601 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003602#endif
3603 } else {
3604 endinpos = s-starts;
3605 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003606 if (unicode_decode_call_errorhandler(
3607 errors, &errorHandler,
3608 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003609 &starts, &end, &startinpos, &endinpos, &exc, &s,
3610 &v, &outpos, &p))
3611 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 nextByte:
3614 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003616 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 Py_XDECREF(errorHandler);
3619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003621
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 Py_XDECREF(errorHandler);
3625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return NULL;
3627}
3628
3629PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003632 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 char *p;
3634 char *q;
3635
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003636#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003637 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003638#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003639 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003640#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003641
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003645 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 if (repr == NULL)
3647 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003648 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003649 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003651 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 while (size-- > 0) {
3653 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003654#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 /* Map 32-bit characters to '\Uxxxxxxxx' */
3656 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003657 *p++ = '\\';
3658 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003659 *p++ = hexdigits[(ch >> 28) & 0xf];
3660 *p++ = hexdigits[(ch >> 24) & 0xf];
3661 *p++ = hexdigits[(ch >> 20) & 0xf];
3662 *p++ = hexdigits[(ch >> 16) & 0xf];
3663 *p++ = hexdigits[(ch >> 12) & 0xf];
3664 *p++ = hexdigits[(ch >> 8) & 0xf];
3665 *p++ = hexdigits[(ch >> 4) & 0xf];
3666 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003667 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003668 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003669#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3671 if (ch >= 0xD800 && ch < 0xDC00) {
3672 Py_UNICODE ch2;
3673 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003674
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 ch2 = *s++;
3676 size--;
3677 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3678 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3679 *p++ = '\\';
3680 *p++ = 'U';
3681 *p++ = hexdigits[(ucs >> 28) & 0xf];
3682 *p++ = hexdigits[(ucs >> 24) & 0xf];
3683 *p++ = hexdigits[(ucs >> 20) & 0xf];
3684 *p++ = hexdigits[(ucs >> 16) & 0xf];
3685 *p++ = hexdigits[(ucs >> 12) & 0xf];
3686 *p++ = hexdigits[(ucs >> 8) & 0xf];
3687 *p++ = hexdigits[(ucs >> 4) & 0xf];
3688 *p++ = hexdigits[ucs & 0xf];
3689 continue;
3690 }
3691 /* Fall through: isolated surrogates are copied as-is */
3692 s--;
3693 size++;
3694 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003695#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 /* Map 16-bit characters to '\uxxxx' */
3697 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 *p++ = '\\';
3699 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003700 *p++ = hexdigits[(ch >> 12) & 0xf];
3701 *p++ = hexdigits[(ch >> 8) & 0xf];
3702 *p++ = hexdigits[(ch >> 4) & 0xf];
3703 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 /* Copy everything else as-is */
3706 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 *p++ = (char) ch;
3708 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003709 size = p - q;
3710
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003711 assert(size > 0);
3712 if (_PyBytes_Resize(&repr, size) < 0)
3713 return NULL;
3714 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3718{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003719 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003721 PyErr_BadArgument();
3722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003724 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3725 PyUnicode_GET_SIZE(unicode));
3726
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003727 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728}
3729
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003730/* --- Unicode Internal Codec ------------------------------------------- */
3731
3732PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 Py_ssize_t size,
3734 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003735{
3736 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t startinpos;
3738 Py_ssize_t endinpos;
3739 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003740 PyUnicodeObject *v;
3741 Py_UNICODE *p;
3742 const char *end;
3743 const char *reason;
3744 PyObject *errorHandler = NULL;
3745 PyObject *exc = NULL;
3746
Neal Norwitzd43069c2006-01-08 01:12:10 +00003747#ifdef Py_UNICODE_WIDE
3748 Py_UNICODE unimax = PyUnicode_GetMax();
3749#endif
3750
Thomas Wouters89f507f2006-12-13 04:49:30 +00003751 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003752 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3753 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003755 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003757 p = PyUnicode_AS_UNICODE(v);
3758 end = s + size;
3759
3760 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003761 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003762 /* We have to sanity check the raw data, otherwise doom looms for
3763 some malformed UCS-4 data. */
3764 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003765#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003766 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003767#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003768 end-s < Py_UNICODE_SIZE
3769 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003771 startinpos = s - starts;
3772 if (end-s < Py_UNICODE_SIZE) {
3773 endinpos = end-starts;
3774 reason = "truncated input";
3775 }
3776 else {
3777 endinpos = s - starts + Py_UNICODE_SIZE;
3778 reason = "illegal code point (> 0x10FFFF)";
3779 }
3780 outpos = p - PyUnicode_AS_UNICODE(v);
3781 if (unicode_decode_call_errorhandler(
3782 errors, &errorHandler,
3783 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003785 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003786 goto onError;
3787 }
3788 }
3789 else {
3790 p++;
3791 s += Py_UNICODE_SIZE;
3792 }
3793 }
3794
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003795 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003796 goto onError;
3797 Py_XDECREF(errorHandler);
3798 Py_XDECREF(exc);
3799 return (PyObject *)v;
3800
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003802 Py_XDECREF(v);
3803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
3805 return NULL;
3806}
3807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808/* --- Latin-1 Codec ------------------------------------------------------ */
3809
3810PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 Py_ssize_t size,
3812 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813{
3814 PyUnicodeObject *v;
3815 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003816 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003819 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 Py_UNICODE r = *(unsigned char*)s;
3821 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003822 }
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 v = _PyUnicode_New(size);
3825 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003830 e = s + size;
3831 /* Unrolling the copy makes it much faster by reducing the looping
3832 overhead. This is similar to what many memcpy() implementations do. */
3833 unrolled_end = e - 4;
3834 while (s < unrolled_end) {
3835 p[0] = (unsigned char) s[0];
3836 p[1] = (unsigned char) s[1];
3837 p[2] = (unsigned char) s[2];
3838 p[3] = (unsigned char) s[3];
3839 s += 4;
3840 p += 4;
3841 }
3842 while (s < e)
3843 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003845
Benjamin Peterson29060642009-01-31 22:14:21 +00003846 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 Py_XDECREF(v);
3848 return NULL;
3849}
3850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851/* create or adjust a UnicodeEncodeError */
3852static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 const char *encoding,
3854 const Py_UNICODE *unicode, Py_ssize_t size,
3855 Py_ssize_t startpos, Py_ssize_t endpos,
3856 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 *exceptionObject = PyUnicodeEncodeError_Create(
3860 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 }
3862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3864 goto onError;
3865 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3866 goto onError;
3867 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3868 goto onError;
3869 return;
3870 onError:
3871 Py_DECREF(*exceptionObject);
3872 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 }
3874}
3875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876/* raises a UnicodeEncodeError */
3877static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 const char *encoding,
3879 const Py_UNICODE *unicode, Py_ssize_t size,
3880 Py_ssize_t startpos, Py_ssize_t endpos,
3881 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882{
3883 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887}
3888
3889/* error handling callback helper:
3890 build arguments, call the callback and check the arguments,
3891 put the result into newpos and return the replacement string, which
3892 has to be freed by the caller */
3893static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 PyObject **errorHandler,
3895 const char *encoding, const char *reason,
3896 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3897 Py_ssize_t startpos, Py_ssize_t endpos,
3898 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003900 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901
3902 PyObject *restuple;
3903 PyObject *resunicode;
3904
3905 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 }
3910
3911 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915
3916 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003921 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 Py_DECREF(restuple);
3923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 }
3925 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 &resunicode, newpos)) {
3927 Py_DECREF(restuple);
3928 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
3930 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003931 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003932 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3934 Py_DECREF(restuple);
3935 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 Py_INCREF(resunicode);
3938 Py_DECREF(restuple);
3939 return resunicode;
3940}
3941
3942static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_ssize_t size,
3944 const char *errors,
3945 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946{
3947 /* output object */
3948 PyObject *res;
3949 /* pointers to the beginning and end+1 of input */
3950 const Py_UNICODE *startp = p;
3951 const Py_UNICODE *endp = p + size;
3952 /* pointer to the beginning of the unencodable characters */
3953 /* const Py_UNICODE *badp = NULL; */
3954 /* pointer into the output */
3955 char *str;
3956 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003958 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3959 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 PyObject *errorHandler = NULL;
3961 PyObject *exc = NULL;
3962 /* the following variable is used for caching string comparisons
3963 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3964 int known_errorHandler = -1;
3965
3966 /* allocate enough for a simple encoding without
3967 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003968 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003969 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003970 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003972 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003973 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 ressize = size;
3975
3976 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 /* can we encode this? */
3980 if (c<limit) {
3981 /* no overflow check, because we know that the space is enough */
3982 *str++ = (char)c;
3983 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 else {
3986 Py_ssize_t unicodepos = p-startp;
3987 Py_ssize_t requiredsize;
3988 PyObject *repunicode;
3989 Py_ssize_t repsize;
3990 Py_ssize_t newpos;
3991 Py_ssize_t respos;
3992 Py_UNICODE *uni2;
3993 /* startpos for collecting unencodable chars */
3994 const Py_UNICODE *collstart = p;
3995 const Py_UNICODE *collend = p;
3996 /* find all unecodable characters */
3997 while ((collend < endp) && ((*collend)>=limit))
3998 ++collend;
3999 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4000 if (known_errorHandler==-1) {
4001 if ((errors==NULL) || (!strcmp(errors, "strict")))
4002 known_errorHandler = 1;
4003 else if (!strcmp(errors, "replace"))
4004 known_errorHandler = 2;
4005 else if (!strcmp(errors, "ignore"))
4006 known_errorHandler = 3;
4007 else if (!strcmp(errors, "xmlcharrefreplace"))
4008 known_errorHandler = 4;
4009 else
4010 known_errorHandler = 0;
4011 }
4012 switch (known_errorHandler) {
4013 case 1: /* strict */
4014 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4015 goto onError;
4016 case 2: /* replace */
4017 while (collstart++<collend)
4018 *str++ = '?'; /* fall through */
4019 case 3: /* ignore */
4020 p = collend;
4021 break;
4022 case 4: /* xmlcharrefreplace */
4023 respos = str - PyBytes_AS_STRING(res);
4024 /* determine replacement size (temporarily (mis)uses p) */
4025 for (p = collstart, repsize = 0; p < collend; ++p) {
4026 if (*p<10)
4027 repsize += 2+1+1;
4028 else if (*p<100)
4029 repsize += 2+2+1;
4030 else if (*p<1000)
4031 repsize += 2+3+1;
4032 else if (*p<10000)
4033 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004034#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 else
4036 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 else if (*p<100000)
4039 repsize += 2+5+1;
4040 else if (*p<1000000)
4041 repsize += 2+6+1;
4042 else
4043 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 }
4046 requiredsize = respos+repsize+(endp-collend);
4047 if (requiredsize > ressize) {
4048 if (requiredsize<2*ressize)
4049 requiredsize = 2*ressize;
4050 if (_PyBytes_Resize(&res, requiredsize))
4051 goto onError;
4052 str = PyBytes_AS_STRING(res) + respos;
4053 ressize = requiredsize;
4054 }
4055 /* generate replacement (temporarily (mis)uses p) */
4056 for (p = collstart; p < collend; ++p) {
4057 str += sprintf(str, "&#%d;", (int)*p);
4058 }
4059 p = collend;
4060 break;
4061 default:
4062 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4063 encoding, reason, startp, size, &exc,
4064 collstart-startp, collend-startp, &newpos);
4065 if (repunicode == NULL)
4066 goto onError;
4067 /* need more space? (at least enough for what we
4068 have+the replacement+the rest of the string, so
4069 we won't have to check space for encodable characters) */
4070 respos = str - PyBytes_AS_STRING(res);
4071 repsize = PyUnicode_GET_SIZE(repunicode);
4072 requiredsize = respos+repsize+(endp-collend);
4073 if (requiredsize > ressize) {
4074 if (requiredsize<2*ressize)
4075 requiredsize = 2*ressize;
4076 if (_PyBytes_Resize(&res, requiredsize)) {
4077 Py_DECREF(repunicode);
4078 goto onError;
4079 }
4080 str = PyBytes_AS_STRING(res) + respos;
4081 ressize = requiredsize;
4082 }
4083 /* check if there is anything unencodable in the replacement
4084 and copy it to the output */
4085 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4086 c = *uni2;
4087 if (c >= limit) {
4088 raise_encode_exception(&exc, encoding, startp, size,
4089 unicodepos, unicodepos+1, reason);
4090 Py_DECREF(repunicode);
4091 goto onError;
4092 }
4093 *str = (char)c;
4094 }
4095 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004096 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004098 }
4099 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004100 /* Resize if we allocated to much */
4101 size = str - PyBytes_AS_STRING(res);
4102 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004103 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004104 if (_PyBytes_Resize(&res, size) < 0)
4105 goto onError;
4106 }
4107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004110 return res;
4111
4112 onError:
4113 Py_XDECREF(res);
4114 Py_XDECREF(errorHandler);
4115 Py_XDECREF(exc);
4116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117}
4118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 Py_ssize_t size,
4121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124}
4125
4126PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4127{
4128 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 PyErr_BadArgument();
4130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 }
4132 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 PyUnicode_GET_SIZE(unicode),
4134 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135}
4136
4137/* --- 7-bit ASCII Codec -------------------------------------------------- */
4138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 Py_ssize_t size,
4141 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 PyUnicodeObject *v;
4145 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004146 Py_ssize_t startinpos;
4147 Py_ssize_t endinpos;
4148 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 const char *e;
4150 PyObject *errorHandler = NULL;
4151 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004152
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004154 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 Py_UNICODE r = *(unsigned char*)s;
4156 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004157 }
Tim Petersced69f82003-09-16 20:30:58 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 v = _PyUnicode_New(size);
4160 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 e = s + size;
4166 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 register unsigned char c = (unsigned char)*s;
4168 if (c < 128) {
4169 *p++ = c;
4170 ++s;
4171 }
4172 else {
4173 startinpos = s-starts;
4174 endinpos = startinpos + 1;
4175 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4176 if (unicode_decode_call_errorhandler(
4177 errors, &errorHandler,
4178 "ascii", "ordinal not in range(128)",
4179 &starts, &e, &startinpos, &endinpos, &exc, &s,
4180 &v, &outpos, &p))
4181 goto onError;
4182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004184 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4186 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 Py_XDECREF(errorHandler);
4188 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004190
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 Py_XDECREF(errorHandler);
4194 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 return NULL;
4196}
4197
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 Py_ssize_t size,
4200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203}
4204
4205PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4206{
4207 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 PyErr_BadArgument();
4209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
4211 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 PyUnicode_GET_SIZE(unicode),
4213 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214}
4215
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004216#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004217
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004218/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004219
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004220#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004221#define NEED_RETRY
4222#endif
4223
4224/* XXX This code is limited to "true" double-byte encodings, as
4225 a) it assumes an incomplete character consists of a single byte, and
4226 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004228
/* Return 1 if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, so the byte is also checked
   against the character that precedes it according to CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4239
4240/*
4241 * Decode MBCS string into unicode object. If 'final' is set, converts
4242 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4243 */
4244static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 const char *s, /* MBCS string */
4246 int size, /* sizeof MBCS string */
4247 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004248{
4249 Py_UNICODE *p;
4250 Py_ssize_t n = 0;
4251 int usize = 0;
4252
4253 assert(size >= 0);
4254
4255 /* Skip trailing lead-byte unless 'final' is set */
4256 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004258
4259 /* First get the size of the result */
4260 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4262 if (usize == 0) {
4263 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4264 return -1;
4265 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004266 }
4267
4268 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004269 /* Create unicode object */
4270 *v = _PyUnicode_New(usize);
4271 if (*v == NULL)
4272 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004273 }
4274 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 /* Extend unicode object */
4276 n = PyUnicode_GET_SIZE(*v);
4277 if (_PyUnicode_Resize(v, n + usize) < 0)
4278 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004279 }
4280
4281 /* Do the conversion */
4282 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 p = PyUnicode_AS_UNICODE(*v) + n;
4284 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4285 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4286 return -1;
4287 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004288 }
4289
4290 return size;
4291}
4292
4293PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_ssize_t size,
4295 const char *errors,
4296 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004297{
4298 PyUnicodeObject *v = NULL;
4299 int done;
4300
4301 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004303
4304#ifdef NEED_RETRY
4305 retry:
4306 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004308 else
4309#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004311
4312 if (done < 0) {
4313 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004315 }
4316
4317 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004319
4320#ifdef NEED_RETRY
4321 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 s += done;
4323 size -= done;
4324 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004325 }
4326#endif
4327
4328 return (PyObject *)v;
4329}
4330
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004331PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 Py_ssize_t size,
4333 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004334{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004335 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4336}
4337
4338/*
4339 * Convert unicode into string object (MBCS).
4340 * Returns 0 if succeed, -1 otherwise.
4341 */
4342static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 const Py_UNICODE *p, /* unicode */
4344 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004345{
4346 int mbcssize = 0;
4347 Py_ssize_t n = 0;
4348
4349 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004350
4351 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004352 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4354 if (mbcssize == 0) {
4355 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4356 return -1;
4357 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004358 }
4359
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004360 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 /* Create string object */
4362 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4363 if (*repr == NULL)
4364 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004365 }
4366 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 /* Extend string object */
4368 n = PyBytes_Size(*repr);
4369 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4370 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004371 }
4372
4373 /* Do the conversion */
4374 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 char *s = PyBytes_AS_STRING(*repr) + n;
4376 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4377 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4378 return -1;
4379 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004380 }
4381
4382 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004383}
4384
4385PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 Py_ssize_t size,
4387 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004388{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004389 PyObject *repr = NULL;
4390 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004391
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004392#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004394 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004396 else
4397#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004399
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004400 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 Py_XDECREF(repr);
4402 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004403 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004404
4405#ifdef NEED_RETRY
4406 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 p += INT_MAX;
4408 size -= INT_MAX;
4409 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004410 }
4411#endif
4412
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004413 return repr;
4414}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004415
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004416PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4417{
4418 if (!PyUnicode_Check(unicode)) {
4419 PyErr_BadArgument();
4420 return NULL;
4421 }
4422 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 PyUnicode_GET_SIZE(unicode),
4424 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004425}
4426
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004427#undef NEED_RETRY
4428
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004429#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004430
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431/* --- Character Mapping Codec -------------------------------------------- */
4432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 Py_ssize_t size,
4435 PyObject *mapping,
4436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 Py_ssize_t startinpos;
4440 Py_ssize_t endinpos;
4441 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 PyUnicodeObject *v;
4444 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 PyObject *errorHandler = NULL;
4447 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004448 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 /* Default to Latin-1 */
4452 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
4455 v = _PyUnicode_New(size);
4456 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004462 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 mapstring = PyUnicode_AS_UNICODE(mapping);
4464 maplen = PyUnicode_GET_SIZE(mapping);
4465 while (s < e) {
4466 unsigned char ch = *s;
4467 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 if (ch < maplen)
4470 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 if (x == 0xfffe) {
4473 /* undefined mapping */
4474 outpos = p-PyUnicode_AS_UNICODE(v);
4475 startinpos = s-starts;
4476 endinpos = startinpos+1;
4477 if (unicode_decode_call_errorhandler(
4478 errors, &errorHandler,
4479 "charmap", "character maps to <undefined>",
4480 &starts, &e, &startinpos, &endinpos, &exc, &s,
4481 &v, &outpos, &p)) {
4482 goto onError;
4483 }
4484 continue;
4485 }
4486 *p++ = x;
4487 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004488 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004489 }
4490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 while (s < e) {
4492 unsigned char ch = *s;
4493 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004494
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4496 w = PyLong_FromLong((long)ch);
4497 if (w == NULL)
4498 goto onError;
4499 x = PyObject_GetItem(mapping, w);
4500 Py_DECREF(w);
4501 if (x == NULL) {
4502 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4503 /* No mapping found means: mapping is undefined. */
4504 PyErr_Clear();
4505 x = Py_None;
4506 Py_INCREF(x);
4507 } else
4508 goto onError;
4509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004510
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 /* Apply mapping */
4512 if (PyLong_Check(x)) {
4513 long value = PyLong_AS_LONG(x);
4514 if (value < 0 || value > 65535) {
4515 PyErr_SetString(PyExc_TypeError,
4516 "character mapping must be in range(65536)");
4517 Py_DECREF(x);
4518 goto onError;
4519 }
4520 *p++ = (Py_UNICODE)value;
4521 }
4522 else if (x == Py_None) {
4523 /* undefined mapping */
4524 outpos = p-PyUnicode_AS_UNICODE(v);
4525 startinpos = s-starts;
4526 endinpos = startinpos+1;
4527 if (unicode_decode_call_errorhandler(
4528 errors, &errorHandler,
4529 "charmap", "character maps to <undefined>",
4530 &starts, &e, &startinpos, &endinpos, &exc, &s,
4531 &v, &outpos, &p)) {
4532 Py_DECREF(x);
4533 goto onError;
4534 }
4535 Py_DECREF(x);
4536 continue;
4537 }
4538 else if (PyUnicode_Check(x)) {
4539 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004540
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 if (targetsize == 1)
4542 /* 1-1 mapping */
4543 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004544
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 else if (targetsize > 1) {
4546 /* 1-n mapping */
4547 if (targetsize > extrachars) {
4548 /* resize first */
4549 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4550 Py_ssize_t needed = (targetsize - extrachars) + \
4551 (targetsize << 2);
4552 extrachars += needed;
4553 /* XXX overflow detection missing */
4554 if (_PyUnicode_Resize(&v,
4555 PyUnicode_GET_SIZE(v) + needed) < 0) {
4556 Py_DECREF(x);
4557 goto onError;
4558 }
4559 p = PyUnicode_AS_UNICODE(v) + oldpos;
4560 }
4561 Py_UNICODE_COPY(p,
4562 PyUnicode_AS_UNICODE(x),
4563 targetsize);
4564 p += targetsize;
4565 extrachars -= targetsize;
4566 }
4567 /* 1-0 mapping: skip the character */
4568 }
4569 else {
4570 /* wrong return value */
4571 PyErr_SetString(PyExc_TypeError,
4572 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004573 Py_DECREF(x);
4574 goto onError;
4575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 Py_DECREF(x);
4577 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
4580 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_XDECREF(errorHandler);
4584 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004586
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_XDECREF(errorHandler);
4589 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 Py_XDECREF(v);
4591 return NULL;
4592}
4593
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004594/* Charmap encoding: the lookup table */
4595
4596struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 PyObject_HEAD
4598 unsigned char level1[32];
4599 int count2, count3;
4600 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601};
4602
4603static PyObject*
4604encoding_map_size(PyObject *obj, PyObject* args)
4605{
4606 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004607 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004609}
4610
4611static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004612 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 PyDoc_STR("Return the size (in bytes) of this object") },
4614 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615};
4616
4617static void
4618encoding_map_dealloc(PyObject* o)
4619{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004620 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004621}
4622
4623static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004624 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 "EncodingMap", /*tp_name*/
4626 sizeof(struct encoding_map), /*tp_basicsize*/
4627 0, /*tp_itemsize*/
4628 /* methods */
4629 encoding_map_dealloc, /*tp_dealloc*/
4630 0, /*tp_print*/
4631 0, /*tp_getattr*/
4632 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004633 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 0, /*tp_repr*/
4635 0, /*tp_as_number*/
4636 0, /*tp_as_sequence*/
4637 0, /*tp_as_mapping*/
4638 0, /*tp_hash*/
4639 0, /*tp_call*/
4640 0, /*tp_str*/
4641 0, /*tp_getattro*/
4642 0, /*tp_setattro*/
4643 0, /*tp_as_buffer*/
4644 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4645 0, /*tp_doc*/
4646 0, /*tp_traverse*/
4647 0, /*tp_clear*/
4648 0, /*tp_richcompare*/
4649 0, /*tp_weaklistoffset*/
4650 0, /*tp_iter*/
4651 0, /*tp_iternext*/
4652 encoding_map_methods, /*tp_methods*/
4653 0, /*tp_members*/
4654 0, /*tp_getset*/
4655 0, /*tp_base*/
4656 0, /*tp_dict*/
4657 0, /*tp_descr_get*/
4658 0, /*tp_descr_set*/
4659 0, /*tp_dictoffset*/
4660 0, /*tp_init*/
4661 0, /*tp_alloc*/
4662 0, /*tp_new*/
4663 0, /*tp_free*/
4664 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004665};
4666
4667PyObject*
4668PyUnicode_BuildEncodingMap(PyObject* string)
4669{
4670 Py_UNICODE *decode;
4671 PyObject *result;
4672 struct encoding_map *mresult;
4673 int i;
4674 int need_dict = 0;
4675 unsigned char level1[32];
4676 unsigned char level2[512];
4677 unsigned char *mlevel1, *mlevel2, *mlevel3;
4678 int count2 = 0, count3 = 0;
4679
4680 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4681 PyErr_BadArgument();
4682 return NULL;
4683 }
4684 decode = PyUnicode_AS_UNICODE(string);
4685 memset(level1, 0xFF, sizeof level1);
4686 memset(level2, 0xFF, sizeof level2);
4687
4688 /* If there isn't a one-to-one mapping of NULL to \0,
4689 or if there are non-BMP characters, we need to use
4690 a mapping dictionary. */
4691 if (decode[0] != 0)
4692 need_dict = 1;
4693 for (i = 1; i < 256; i++) {
4694 int l1, l2;
4695 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004696#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004698#endif
4699 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004700 need_dict = 1;
4701 break;
4702 }
4703 if (decode[i] == 0xFFFE)
4704 /* unmapped character */
4705 continue;
4706 l1 = decode[i] >> 11;
4707 l2 = decode[i] >> 7;
4708 if (level1[l1] == 0xFF)
4709 level1[l1] = count2++;
4710 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004711 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004712 }
4713
4714 if (count2 >= 0xFF || count3 >= 0xFF)
4715 need_dict = 1;
4716
4717 if (need_dict) {
4718 PyObject *result = PyDict_New();
4719 PyObject *key, *value;
4720 if (!result)
4721 return NULL;
4722 for (i = 0; i < 256; i++) {
4723 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004724 key = PyLong_FromLong(decode[i]);
4725 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004726 if (!key || !value)
4727 goto failed1;
4728 if (PyDict_SetItem(result, key, value) == -1)
4729 goto failed1;
4730 Py_DECREF(key);
4731 Py_DECREF(value);
4732 }
4733 return result;
4734 failed1:
4735 Py_XDECREF(key);
4736 Py_XDECREF(value);
4737 Py_DECREF(result);
4738 return NULL;
4739 }
4740
4741 /* Create a three-level trie */
4742 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4743 16*count2 + 128*count3 - 1);
4744 if (!result)
4745 return PyErr_NoMemory();
4746 PyObject_Init(result, &EncodingMapType);
4747 mresult = (struct encoding_map*)result;
4748 mresult->count2 = count2;
4749 mresult->count3 = count3;
4750 mlevel1 = mresult->level1;
4751 mlevel2 = mresult->level23;
4752 mlevel3 = mresult->level23 + 16*count2;
4753 memcpy(mlevel1, level1, 32);
4754 memset(mlevel2, 0xFF, 16*count2);
4755 memset(mlevel3, 0, 128*count3);
4756 count3 = 0;
4757 for (i = 1; i < 256; i++) {
4758 int o1, o2, o3, i2, i3;
4759 if (decode[i] == 0xFFFE)
4760 /* unmapped character */
4761 continue;
4762 o1 = decode[i]>>11;
4763 o2 = (decode[i]>>7) & 0xF;
4764 i2 = 16*mlevel1[o1] + o2;
4765 if (mlevel2[i2] == 0xFF)
4766 mlevel2[i2] = count3++;
4767 o3 = decode[i] & 0x7F;
4768 i3 = 128*mlevel2[i2] + o3;
4769 mlevel3[i3] = i;
4770 }
4771 return result;
4772}
4773
4774static int
4775encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4776{
4777 struct encoding_map *map = (struct encoding_map*)mapping;
4778 int l1 = c>>11;
4779 int l2 = (c>>7) & 0xF;
4780 int l3 = c & 0x7F;
4781 int i;
4782
4783#ifdef Py_UNICODE_WIDE
4784 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004786 }
4787#endif
4788 if (c == 0)
4789 return 0;
4790 /* level 1*/
4791 i = map->level1[l1];
4792 if (i == 0xFF) {
4793 return -1;
4794 }
4795 /* level 2*/
4796 i = map->level23[16*i+l2];
4797 if (i == 0xFF) {
4798 return -1;
4799 }
4800 /* level 3 */
4801 i = map->level23[16*map->count2 + 128*i + l3];
4802 if (i == 0) {
4803 return -1;
4804 }
4805 return i;
4806}
4807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808/* Lookup the character ch in the mapping. If the character
4809 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004810 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812{
Christian Heimes217cfd12007-12-02 14:31:20 +00004813 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 PyObject *x;
4815
4816 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 x = PyObject_GetItem(mapping, w);
4819 Py_DECREF(w);
4820 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4822 /* No mapping found means: mapping is undefined. */
4823 PyErr_Clear();
4824 x = Py_None;
4825 Py_INCREF(x);
4826 return x;
4827 } else
4828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004830 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004832 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 long value = PyLong_AS_LONG(x);
4834 if (value < 0 || value > 255) {
4835 PyErr_SetString(PyExc_TypeError,
4836 "character mapping must be in range(256)");
4837 Py_DECREF(x);
4838 return NULL;
4839 }
4840 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 /* wrong return value */
4846 PyErr_Format(PyExc_TypeError,
4847 "character mapping must return integer, bytes or None, not %.400s",
4848 x->ob_type->tp_name);
4849 Py_DECREF(x);
4850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 }
4852}
4853
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004854static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004855charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004857 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4858 /* exponentially overallocate to minimize reallocations */
4859 if (requiredsize < 2*outsize)
4860 requiredsize = 2*outsize;
4861 if (_PyBytes_Resize(outobj, requiredsize))
4862 return -1;
4863 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004864}
4865
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* an error occurred while looking up or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004870 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 space is available. Return a new reference to the object that
4872 was put in the output buffer, or Py_None, if the mapping was undefined
4873 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004874 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004876charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004879 PyObject *rep;
4880 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004881 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882
Christian Heimes90aa7642007-12-19 02:45:37 +00004883 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004884 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886 if (res == -1)
4887 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (outsize<requiredsize)
4889 if (charmapencode_resize(outobj, outpos, requiredsize))
4890 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004891 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 outstart[(*outpos)++] = (char)res;
4893 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004894 }
4895
4896 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004899 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 Py_DECREF(rep);
4901 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004902 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 if (PyLong_Check(rep)) {
4904 Py_ssize_t requiredsize = *outpos+1;
4905 if (outsize<requiredsize)
4906 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4907 Py_DECREF(rep);
4908 return enc_EXCEPTION;
4909 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004910 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 else {
4914 const char *repchars = PyBytes_AS_STRING(rep);
4915 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4916 Py_ssize_t requiredsize = *outpos+repsize;
4917 if (outsize<requiredsize)
4918 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4919 Py_DECREF(rep);
4920 return enc_EXCEPTION;
4921 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004922 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 memcpy(outstart + *outpos, repchars, repsize);
4924 *outpos += repsize;
4925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004927 Py_DECREF(rep);
4928 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929}
4930
4931/* handle an error in PyUnicode_EncodeCharmap
4932 Return 0 on success, -1 on error */
4933static
4934int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004937 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004938 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939{
4940 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 Py_ssize_t repsize;
4942 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943 Py_UNICODE *uni2;
4944 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004945 Py_ssize_t collstartpos = *inpos;
4946 Py_ssize_t collendpos = *inpos+1;
4947 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 char *encoding = "charmap";
4949 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004950 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 /* find all unencodable characters */
4953 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004954 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004955 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 int res = encoding_map_lookup(p[collendpos], mapping);
4957 if (res != -1)
4958 break;
4959 ++collendpos;
4960 continue;
4961 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004962
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 rep = charmapencode_lookup(p[collendpos], mapping);
4964 if (rep==NULL)
4965 return -1;
4966 else if (rep!=Py_None) {
4967 Py_DECREF(rep);
4968 break;
4969 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004970 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 }
4973 /* cache callback name lookup
4974 * (if not done yet, i.e. it's the first error) */
4975 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 if ((errors==NULL) || (!strcmp(errors, "strict")))
4977 *known_errorHandler = 1;
4978 else if (!strcmp(errors, "replace"))
4979 *known_errorHandler = 2;
4980 else if (!strcmp(errors, "ignore"))
4981 *known_errorHandler = 3;
4982 else if (!strcmp(errors, "xmlcharrefreplace"))
4983 *known_errorHandler = 4;
4984 else
4985 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 }
4987 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004988 case 1: /* strict */
4989 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4990 return -1;
4991 case 2: /* replace */
4992 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 x = charmapencode_output('?', mapping, res, respos);
4994 if (x==enc_EXCEPTION) {
4995 return -1;
4996 }
4997 else if (x==enc_FAILED) {
4998 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4999 return -1;
5000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005001 }
5002 /* fall through */
5003 case 3: /* ignore */
5004 *inpos = collendpos;
5005 break;
5006 case 4: /* xmlcharrefreplace */
5007 /* generate replacement (temporarily (mis)uses p) */
5008 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 char buffer[2+29+1+1];
5010 char *cp;
5011 sprintf(buffer, "&#%d;", (int)p[collpos]);
5012 for (cp = buffer; *cp; ++cp) {
5013 x = charmapencode_output(*cp, mapping, res, respos);
5014 if (x==enc_EXCEPTION)
5015 return -1;
5016 else if (x==enc_FAILED) {
5017 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5018 return -1;
5019 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005020 }
5021 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005022 *inpos = collendpos;
5023 break;
5024 default:
5025 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 encoding, reason, p, size, exceptionObject,
5027 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005028 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 return -1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005030 /* generate replacement */
5031 repsize = PyUnicode_GET_SIZE(repunicode);
5032 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 x = charmapencode_output(*uni2, mapping, res, respos);
5034 if (x==enc_EXCEPTION) {
5035 return -1;
5036 }
5037 else if (x==enc_FAILED) {
5038 Py_DECREF(repunicode);
5039 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5040 return -1;
5041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005042 }
5043 *inpos = newpos;
5044 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 }
5046 return 0;
5047}
5048
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 Py_ssize_t size,
5051 PyObject *mapping,
5052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054 /* output object */
5055 PyObject *res = NULL;
5056 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005057 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 PyObject *errorHandler = NULL;
5061 PyObject *exc = NULL;
5062 /* the following variable is used for caching string comparisons
5063 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5064 * 3=ignore, 4=xmlcharrefreplace */
5065 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
5067 /* Default to Latin-1 */
5068 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 /* allocate enough for a simple encoding without
5072 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005073 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 if (res == NULL)
5075 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005076 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 /* try to encode it */
5081 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5082 if (x==enc_EXCEPTION) /* error */
5083 goto onError;
5084 if (x==enc_FAILED) { /* unencodable character */
5085 if (charmap_encoding_error(p, size, &inpos, mapping,
5086 &exc,
5087 &known_errorHandler, &errorHandler, errors,
5088 &res, &respos)) {
5089 goto onError;
5090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 else
5093 /* done with this character => adjust input position */
5094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005098 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005099 if (_PyBytes_Resize(&res, respos) < 0)
5100 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 Py_XDECREF(exc);
5103 Py_XDECREF(errorHandler);
5104 return res;
5105
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 Py_XDECREF(res);
5108 Py_XDECREF(exc);
5109 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 return NULL;
5111}
5112
5113PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115{
5116 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 PyErr_BadArgument();
5118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
5120 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 PyUnicode_GET_SIZE(unicode),
5122 mapping,
5123 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124}
5125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126/* create or adjust a UnicodeTranslateError */
5127static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 const Py_UNICODE *unicode, Py_ssize_t size,
5129 Py_ssize_t startpos, Py_ssize_t endpos,
5130 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005133 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
5136 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5138 goto onError;
5139 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5140 goto onError;
5141 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5142 goto onError;
5143 return;
5144 onError:
5145 Py_DECREF(*exceptionObject);
5146 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 }
5148}
5149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150/* raises a UnicodeTranslateError */
5151static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 const Py_UNICODE *unicode, Py_ssize_t size,
5153 Py_ssize_t startpos, Py_ssize_t endpos,
5154 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155{
5156 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160}
5161
5162/* error handling callback helper:
5163 build arguments, call the callback and check the arguments,
5164 put the result into newpos and return the replacement string, which
5165 has to be freed by the caller */
5166static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 PyObject **errorHandler,
5168 const char *reason,
5169 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5170 Py_ssize_t startpos, Py_ssize_t endpos,
5171 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005173 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005175 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005176 PyObject *restuple;
5177 PyObject *resunicode;
5178
5179 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 }
5184
5185 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189
5190 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005195 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_DECREF(restuple);
5197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 }
5199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 &resunicode, &i_newpos)) {
5201 Py_DECREF(restuple);
5202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 else
5207 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005208 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5210 Py_DECREF(restuple);
5211 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005212 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 Py_INCREF(resunicode);
5214 Py_DECREF(restuple);
5215 return resunicode;
5216}
5217
5218/* Lookup the character ch in the mapping and put the result in result,
5219 which must be decrefed by the caller.
5220 Return 0 on success, -1 on error */
5221static
5222int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5223{
Christian Heimes217cfd12007-12-02 14:31:20 +00005224 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 PyObject *x;
5226
5227 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 x = PyObject_GetItem(mapping, w);
5230 Py_DECREF(w);
5231 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5233 /* No mapping found means: use 1:1 mapping. */
5234 PyErr_Clear();
5235 *result = NULL;
5236 return 0;
5237 } else
5238 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 }
5240 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 *result = x;
5242 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005244 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 long value = PyLong_AS_LONG(x);
5246 long max = PyUnicode_GetMax();
5247 if (value < 0 || value > max) {
5248 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005249 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 Py_DECREF(x);
5251 return -1;
5252 }
5253 *result = x;
5254 return 0;
5255 }
5256 else if (PyUnicode_Check(x)) {
5257 *result = x;
5258 return 0;
5259 }
5260 else {
5261 /* wrong return value */
5262 PyErr_SetString(PyExc_TypeError,
5263 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005264 Py_DECREF(x);
5265 return -1;
5266 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267}
5268/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 if not reallocate and adjust various state variables.
5270 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271static
Walter Dörwald4894c302003-10-24 14:25:28 +00005272int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005275 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005276 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 /* remember old output position */
5278 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5279 /* exponentially overallocate to minimize reallocations */
5280 if (requiredsize < 2 * oldsize)
5281 requiredsize = 2 * oldsize;
5282 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5283 return -1;
5284 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 }
5286 return 0;
5287}
5288/* lookup the character, put the result in the output string and adjust
5289 various state variables. Return a new reference to the object that
5290 was put in the output buffer in *result, or Py_None, if the mapping was
5291 undefined (in which case no character was written).
5292 The called must decref result.
5293 Return 0 on success, -1 on error. */
5294static
Walter Dörwald4894c302003-10-24 14:25:28 +00005295int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5297 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298{
Walter Dörwald4894c302003-10-24 14:25:28 +00005299 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 /* not found => default to 1:1 mapping */
5303 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 }
5305 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005307 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 /* no overflow check, because we know that the space is enough */
5309 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 }
5311 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5313 if (repsize==1) {
5314 /* no overflow check, because we know that the space is enough */
5315 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5316 }
5317 else if (repsize!=0) {
5318 /* more than one character */
5319 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5320 (insize - (curinp-startinp)) +
5321 repsize - 1;
5322 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5323 return -1;
5324 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5325 *outp += repsize;
5326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327 }
5328 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 return 0;
5331}
5332
5333PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 Py_ssize_t size,
5335 PyObject *mapping,
5336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 /* output object */
5339 PyObject *res = NULL;
5340 /* pointers to the beginning and end+1 of input */
5341 const Py_UNICODE *startp = p;
5342 const Py_UNICODE *endp = p + size;
5343 /* pointer into the output */
5344 Py_UNICODE *str;
5345 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 char *reason = "character maps to <undefined>";
5348 PyObject *errorHandler = NULL;
5349 PyObject *exc = NULL;
5350 /* the following variable is used for caching string comparisons
5351 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5352 * 3=ignore, 4=xmlcharrefreplace */
5353 int known_errorHandler = -1;
5354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 PyErr_BadArgument();
5357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359
5360 /* allocate enough for a simple 1:1 translation without
5361 replacements, if we need more, we'll resize */
5362 res = PyUnicode_FromUnicode(NULL, size);
5363 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 /* try to encode it */
5371 PyObject *x = NULL;
5372 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5373 Py_XDECREF(x);
5374 goto onError;
5375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005376 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 if (x!=Py_None) /* it worked => adjust input pointer */
5378 ++p;
5379 else { /* untranslatable character */
5380 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5381 Py_ssize_t repsize;
5382 Py_ssize_t newpos;
5383 Py_UNICODE *uni2;
5384 /* startpos for collecting untranslatable chars */
5385 const Py_UNICODE *collstart = p;
5386 const Py_UNICODE *collend = p+1;
5387 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 /* find all untranslatable characters */
5390 while (collend < endp) {
5391 if (charmaptranslate_lookup(*collend, mapping, &x))
5392 goto onError;
5393 Py_XDECREF(x);
5394 if (x!=Py_None)
5395 break;
5396 ++collend;
5397 }
5398 /* cache callback name lookup
5399 * (if not done yet, i.e. it's the first error) */
5400 if (known_errorHandler==-1) {
5401 if ((errors==NULL) || (!strcmp(errors, "strict")))
5402 known_errorHandler = 1;
5403 else if (!strcmp(errors, "replace"))
5404 known_errorHandler = 2;
5405 else if (!strcmp(errors, "ignore"))
5406 known_errorHandler = 3;
5407 else if (!strcmp(errors, "xmlcharrefreplace"))
5408 known_errorHandler = 4;
5409 else
5410 known_errorHandler = 0;
5411 }
5412 switch (known_errorHandler) {
5413 case 1: /* strict */
5414 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 case 2: /* replace */
5417 /* No need to check for space, this is a 1:1 replacement */
5418 for (coll = collstart; coll<collend; ++coll)
5419 *str++ = '?';
5420 /* fall through */
5421 case 3: /* ignore */
5422 p = collend;
5423 break;
5424 case 4: /* xmlcharrefreplace */
5425 /* generate replacement (temporarily (mis)uses p) */
5426 for (p = collstart; p < collend; ++p) {
5427 char buffer[2+29+1+1];
5428 char *cp;
5429 sprintf(buffer, "&#%d;", (int)*p);
5430 if (charmaptranslate_makespace(&res, &str,
5431 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5432 goto onError;
5433 for (cp = buffer; *cp; ++cp)
5434 *str++ = *cp;
5435 }
5436 p = collend;
5437 break;
5438 default:
5439 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5440 reason, startp, size, &exc,
5441 collstart-startp, collend-startp, &newpos);
5442 if (repunicode == NULL)
5443 goto onError;
5444 /* generate replacement */
5445 repsize = PyUnicode_GET_SIZE(repunicode);
5446 if (charmaptranslate_makespace(&res, &str,
5447 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5448 Py_DECREF(repunicode);
5449 goto onError;
5450 }
5451 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5452 *str++ = *uni2;
5453 p = startp + newpos;
5454 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005456 }
5457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 /* Resize if we allocated to much */
5459 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005460 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 if (PyUnicode_Resize(&res, respos) < 0)
5462 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 }
5464 Py_XDECREF(exc);
5465 Py_XDECREF(errorHandler);
5466 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 Py_XDECREF(res);
5470 Py_XDECREF(exc);
5471 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 return NULL;
5473}
5474
5475PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 PyObject *mapping,
5477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
5479 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 str = PyUnicode_FromObject(str);
5482 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 PyUnicode_GET_SIZE(str),
5486 mapping,
5487 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 Py_DECREF(str);
5489 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 Py_XDECREF(str);
5493 return NULL;
5494}
Tim Petersced69f82003-09-16 20:30:58 +00005495
Guido van Rossum9e896b32000-04-05 20:11:21 +00005496/* --- Decimal Encoder ---------------------------------------------------- */
5497
5498int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 Py_ssize_t length,
5500 char *output,
5501 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005502{
5503 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 PyObject *errorHandler = NULL;
5505 PyObject *exc = NULL;
5506 const char *encoding = "decimal";
5507 const char *reason = "invalid decimal Unicode string";
5508 /* the following variable is used for caching string comparisons
5509 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5510 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005511
5512 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 PyErr_BadArgument();
5514 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005515 }
5516
5517 p = s;
5518 end = s + length;
5519 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 register Py_UNICODE ch = *p;
5521 int decimal;
5522 PyObject *repunicode;
5523 Py_ssize_t repsize;
5524 Py_ssize_t newpos;
5525 Py_UNICODE *uni2;
5526 Py_UNICODE *collstart;
5527 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005528
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005530 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 ++p;
5532 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 decimal = Py_UNICODE_TODECIMAL(ch);
5535 if (decimal >= 0) {
5536 *output++ = '0' + decimal;
5537 ++p;
5538 continue;
5539 }
5540 if (0 < ch && ch < 256) {
5541 *output++ = (char)ch;
5542 ++p;
5543 continue;
5544 }
5545 /* All other characters are considered unencodable */
5546 collstart = p;
5547 collend = p+1;
5548 while (collend < end) {
5549 if ((0 < *collend && *collend < 256) ||
5550 !Py_UNICODE_ISSPACE(*collend) ||
5551 Py_UNICODE_TODECIMAL(*collend))
5552 break;
5553 }
5554 /* cache callback name lookup
5555 * (if not done yet, i.e. it's the first error) */
5556 if (known_errorHandler==-1) {
5557 if ((errors==NULL) || (!strcmp(errors, "strict")))
5558 known_errorHandler = 1;
5559 else if (!strcmp(errors, "replace"))
5560 known_errorHandler = 2;
5561 else if (!strcmp(errors, "ignore"))
5562 known_errorHandler = 3;
5563 else if (!strcmp(errors, "xmlcharrefreplace"))
5564 known_errorHandler = 4;
5565 else
5566 known_errorHandler = 0;
5567 }
5568 switch (known_errorHandler) {
5569 case 1: /* strict */
5570 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5571 goto onError;
5572 case 2: /* replace */
5573 for (p = collstart; p < collend; ++p)
5574 *output++ = '?';
5575 /* fall through */
5576 case 3: /* ignore */
5577 p = collend;
5578 break;
5579 case 4: /* xmlcharrefreplace */
5580 /* generate replacement (temporarily (mis)uses p) */
5581 for (p = collstart; p < collend; ++p)
5582 output += sprintf(output, "&#%d;", (int)*p);
5583 p = collend;
5584 break;
5585 default:
5586 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5587 encoding, reason, s, length, &exc,
5588 collstart-s, collend-s, &newpos);
5589 if (repunicode == NULL)
5590 goto onError;
5591 /* generate replacement */
5592 repsize = PyUnicode_GET_SIZE(repunicode);
5593 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5594 Py_UNICODE ch = *uni2;
5595 if (Py_UNICODE_ISSPACE(ch))
5596 *output++ = ' ';
5597 else {
5598 decimal = Py_UNICODE_TODECIMAL(ch);
5599 if (decimal >= 0)
5600 *output++ = '0' + decimal;
5601 else if (0 < ch && ch < 256)
5602 *output++ = (char)ch;
5603 else {
5604 Py_DECREF(repunicode);
5605 raise_encode_exception(&exc, encoding,
5606 s, length, collstart-s, collend-s, reason);
5607 goto onError;
5608 }
5609 }
5610 }
5611 p = s + newpos;
5612 Py_DECREF(repunicode);
5613 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005614 }
5615 /* 0-terminate the output string */
5616 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 Py_XDECREF(exc);
5618 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005619 return 0;
5620
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 Py_XDECREF(exc);
5623 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005624 return -1;
5625}
5626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627/* --- Helpers ------------------------------------------------------------ */
5628
Eric Smith8c663262007-08-25 02:26:07 +00005629#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005630#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005631#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005632/* Include _ParseTupleFinds from find.h */
5633#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005634#include "stringlib/find.h"
5635#include "stringlib/partition.h"
5636
Eric Smith5807c412008-05-11 21:00:57 +00005637#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005638#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005639#include "stringlib/localeutil.h"
5640
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope:
   a negative start is taken relative to (obj)->length and then clamped
   to 0; end is first capped at (obj)->length, then a negative end is
   taken relative to the length and clamped to 0.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5653
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 PyObject *substr,
5656 Py_ssize_t start,
5657 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005659 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005660 PyUnicodeObject* str_obj;
5661 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005662
Thomas Wouters477c8d52006-05-27 19:21:47 +00005663 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5664 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005666 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5667 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 Py_DECREF(str_obj);
5669 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Tim Petersced69f82003-09-16 20:30:58 +00005671
Thomas Wouters477c8d52006-05-27 19:21:47 +00005672 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005673
Thomas Wouters477c8d52006-05-27 19:21:47 +00005674 result = stringlib_count(
5675 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5676 );
5677
5678 Py_DECREF(sub_obj);
5679 Py_DECREF(str_obj);
5680
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return result;
5682}
5683
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005685 PyObject *sub,
5686 Py_ssize_t start,
5687 Py_ssize_t end,
5688 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005690 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005691
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005693 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005695 sub = PyUnicode_FromObject(sub);
5696 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 Py_DECREF(str);
5698 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
Tim Petersced69f82003-09-16 20:30:58 +00005700
Thomas Wouters477c8d52006-05-27 19:21:47 +00005701 if (direction > 0)
5702 result = stringlib_find_slice(
5703 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5704 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5705 start, end
5706 );
5707 else
5708 result = stringlib_rfind_slice(
5709 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5710 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5711 start, end
5712 );
5713
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005715 Py_DECREF(sub);
5716
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 return result;
5718}
5719
Tim Petersced69f82003-09-16 20:30:58 +00005720static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 PyUnicodeObject *substring,
5723 Py_ssize_t start,
5724 Py_ssize_t end,
5725 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 if (substring->length == 0)
5728 return 1;
5729
Thomas Wouters477c8d52006-05-27 19:21:47 +00005730 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
5732 end -= substring->length;
5733 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 if (Py_UNICODE_MATCH(self, end, substring))
5738 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 } else {
5740 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 }
5743
5744 return 0;
5745}
5746
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 PyObject *substr,
5749 Py_ssize_t start,
5750 Py_ssize_t end,
5751 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005754
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 str = PyUnicode_FromObject(str);
5756 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 substr = PyUnicode_FromObject(substr);
5759 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 Py_DECREF(str);
5761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 }
Tim Petersced69f82003-09-16 20:30:58 +00005763
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 (PyUnicodeObject *)substr,
5766 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 Py_DECREF(str);
5768 Py_DECREF(substr);
5769 return result;
5770}
5771
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772/* Apply fixfct filter to the Unicode object self and return a
5773 reference to the modified object */
5774
Tim Petersced69f82003-09-16 20:30:58 +00005775static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
5779
5780 PyUnicodeObject *u;
5781
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005782 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005785
5786 Py_UNICODE_COPY(u->str, self->str, self->length);
5787
Tim Peters7a29bd52001-09-12 03:03:31 +00005788 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 /* fixfct should return TRUE if it modified the buffer. If
5790 FALSE, return a reference to the original buffer instead
5791 (to save space, not time) */
5792 Py_INCREF(self);
5793 Py_DECREF(u);
5794 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 }
5796 return (PyObject*) u;
5797}
5798
Tim Petersced69f82003-09-16 20:30:58 +00005799static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800int fixupper(PyUnicodeObject *self)
5801{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005802 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 Py_UNICODE *s = self->str;
5804 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005805
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005808
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 ch = Py_UNICODE_TOUPPER(*s);
5810 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 *s = ch;
5813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 s++;
5815 }
5816
5817 return status;
5818}
5819
Tim Petersced69f82003-09-16 20:30:58 +00005820static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821int fixlower(PyUnicodeObject *self)
5822{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 Py_UNICODE *s = self->str;
5825 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005829
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 ch = Py_UNICODE_TOLOWER(*s);
5831 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 *s = ch;
5834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 s++;
5836 }
5837
5838 return status;
5839}
5840
Tim Petersced69f82003-09-16 20:30:58 +00005841static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842int fixswapcase(PyUnicodeObject *self)
5843{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005844 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 Py_UNICODE *s = self->str;
5846 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005847
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 while (len-- > 0) {
5849 if (Py_UNICODE_ISUPPER(*s)) {
5850 *s = Py_UNICODE_TOLOWER(*s);
5851 status = 1;
5852 } else if (Py_UNICODE_ISLOWER(*s)) {
5853 *s = Py_UNICODE_TOUPPER(*s);
5854 status = 1;
5855 }
5856 s++;
5857 }
5858
5859 return status;
5860}
5861
Tim Petersced69f82003-09-16 20:30:58 +00005862static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863int fixcapitalize(PyUnicodeObject *self)
5864{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005866 Py_UNICODE *s = self->str;
5867 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005868
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005869 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005871 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *s = Py_UNICODE_TOUPPER(*s);
5873 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005875 s++;
5876 while (--len > 0) {
5877 if (Py_UNICODE_ISUPPER(*s)) {
5878 *s = Py_UNICODE_TOLOWER(*s);
5879 status = 1;
5880 }
5881 s++;
5882 }
5883 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
5886static
5887int fixtitle(PyUnicodeObject *self)
5888{
5889 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5890 register Py_UNICODE *e;
5891 int previous_is_cased;
5892
5893 /* Shortcut for single character strings */
5894 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5896 if (*p != ch) {
5897 *p = ch;
5898 return 1;
5899 }
5900 else
5901 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
Tim Petersced69f82003-09-16 20:30:58 +00005903
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 e = p + PyUnicode_GET_SIZE(self);
5905 previous_is_cased = 0;
5906 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005908
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 if (previous_is_cased)
5910 *p = Py_UNICODE_TOLOWER(ch);
5911 else
5912 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 if (Py_UNICODE_ISLOWER(ch) ||
5915 Py_UNICODE_ISUPPER(ch) ||
5916 Py_UNICODE_ISTITLE(ch))
5917 previous_is_cased = 1;
5918 else
5919 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 }
5921 return 1;
5922}
5923
Tim Peters8ce9f162004-08-27 01:49:32 +00005924PyObject *
5925PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
Skip Montanaro6543b452004-09-16 03:28:13 +00005927 const Py_UNICODE blank = ' ';
5928 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005930 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005931 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5932 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005933 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5934 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005935 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005936 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Tim Peters05eba1f2004-08-27 21:32:02 +00005938 fseq = PySequence_Fast(seq, "");
5939 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005940 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005941 }
5942
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005943 /* NOTE: the following code can't call back into Python code,
5944 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005945 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005946
Tim Peters05eba1f2004-08-27 21:32:02 +00005947 seqlen = PySequence_Fast_GET_SIZE(fseq);
5948 /* If empty sequence, return u"". */
5949 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005950 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5951 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005952 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005953 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005954 /* If singleton sequence with an exact Unicode, return that. */
5955 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 item = items[0];
5957 if (PyUnicode_CheckExact(item)) {
5958 Py_INCREF(item);
5959 res = (PyUnicodeObject *)item;
5960 goto Done;
5961 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005962 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005963 else {
5964 /* Set up sep and seplen */
5965 if (separator == NULL) {
5966 sep = &blank;
5967 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005968 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005969 else {
5970 if (!PyUnicode_Check(separator)) {
5971 PyErr_Format(PyExc_TypeError,
5972 "separator: expected str instance,"
5973 " %.80s found",
5974 Py_TYPE(separator)->tp_name);
5975 goto onError;
5976 }
5977 sep = PyUnicode_AS_UNICODE(separator);
5978 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005979 }
5980 }
5981
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005982 /* There are at least two things to join, or else we have a subclass
5983 * of str in the sequence.
5984 * Do a pre-pass to figure out the total amount of space we'll
5985 * need (sz), and see whether all argument are strings.
5986 */
5987 sz = 0;
5988 for (i = 0; i < seqlen; i++) {
5989 const Py_ssize_t old_sz = sz;
5990 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 if (!PyUnicode_Check(item)) {
5992 PyErr_Format(PyExc_TypeError,
5993 "sequence item %zd: expected str instance,"
5994 " %.80s found",
5995 i, Py_TYPE(item)->tp_name);
5996 goto onError;
5997 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005998 sz += PyUnicode_GET_SIZE(item);
5999 if (i != 0)
6000 sz += seplen;
6001 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6002 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006004 goto onError;
6005 }
6006 }
Tim Petersced69f82003-09-16 20:30:58 +00006007
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006008 res = _PyUnicode_New(sz);
6009 if (res == NULL)
6010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006011
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006012 /* Catenate everything. */
6013 res_p = PyUnicode_AS_UNICODE(res);
6014 for (i = 0; i < seqlen; ++i) {
6015 Py_ssize_t itemlen;
6016 item = items[i];
6017 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* Copy item, and maybe the separator. */
6019 if (i) {
6020 Py_UNICODE_COPY(res_p, sep, seplen);
6021 res_p += seplen;
6022 }
6023 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6024 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006025 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006028 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return (PyObject *)res;
6030
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006032 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006033 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 return NULL;
6035}
6036
Tim Petersced69f82003-09-16 20:30:58 +00006037static
6038PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 Py_ssize_t left,
6040 Py_ssize_t right,
6041 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042{
6043 PyUnicodeObject *u;
6044
6045 if (left < 0)
6046 left = 0;
6047 if (right < 0)
6048 right = 0;
6049
Tim Peters7a29bd52001-09-12 03:03:31 +00006050 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 Py_INCREF(self);
6052 return self;
6053 }
6054
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006055 if (left > PY_SSIZE_T_MAX - self->length ||
6056 right > PY_SSIZE_T_MAX - (left + self->length)) {
6057 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6058 return NULL;
6059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 u = _PyUnicode_New(left + self->length + right);
6061 if (u) {
6062 if (left)
6063 Py_UNICODE_FILL(u->str, fill, left);
6064 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6065 if (right)
6066 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6067 }
6068
6069 return u;
6070}
6071
/* Append the slice data[left:right] to `list` as a new unicode object.
   The expansion relies on the enclosing function providing a
   `PyObject *str` scratch variable, a `list` variable and an `onError`
   label, which is jumped to when creating or appending the slice fails.
   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and must be terminated with a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
6083static
6084PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 PyObject *list,
6086 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 register Py_ssize_t i;
6089 register Py_ssize_t j;
6090 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006092 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093
6094 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006096 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006098 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6100 i++;
6101 if (j < i) {
6102 if (maxcount-- <= 0)
6103 break;
6104 SPLIT_APPEND(buf, j, i);
6105 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6106 i++;
6107 j = i;
6108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
6110 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
6113 return list;
6114
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 Py_DECREF(list);
6117 return NULL;
6118}
6119
6120PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 register Py_ssize_t i;
6124 register Py_ssize_t j;
6125 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 PyObject *list;
6127 PyObject *str;
6128 Py_UNICODE *data;
6129
6130 string = PyUnicode_FromObject(string);
6131 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 data = PyUnicode_AS_UNICODE(string);
6134 len = PyUnicode_GET_SIZE(string);
6135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 list = PyList_New(0);
6137 if (!list)
6138 goto onError;
6139
6140 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006142
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 /* Find a line and append it */
6144 while (i < len && !BLOOM_LINEBREAK(data[i]))
6145 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006148 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 if (i < len) {
6150 if (data[i] == '\r' && i + 1 < len &&
6151 data[i+1] == '\n')
6152 i += 2;
6153 else
6154 i++;
6155 if (keepends)
6156 eol = i;
6157 }
6158 SPLIT_APPEND(data, j, eol);
6159 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
6161 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
6164
6165 Py_DECREF(string);
6166 return list;
6167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006169 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_DECREF(string);
6171 return NULL;
6172}
6173
Tim Petersced69f82003-09-16 20:30:58 +00006174static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 PyObject *list,
6177 Py_UNICODE ch,
6178 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006180 register Py_ssize_t i;
6181 register Py_ssize_t j;
6182 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006184 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 if (buf[i] == ch) {
6188 if (maxcount-- <= 0)
6189 break;
6190 SPLIT_APPEND(buf, j, i);
6191 i = j = i + 1;
6192 } else
6193 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 }
6195 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 }
6198 return list;
6199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 Py_DECREF(list);
6202 return NULL;
6203}
6204
Tim Petersced69f82003-09-16 20:30:58 +00006205static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 PyObject *list,
6208 PyUnicodeObject *substring,
6209 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006211 register Py_ssize_t i;
6212 register Py_ssize_t j;
6213 Py_ssize_t len = self->length;
6214 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 PyObject *str;
6216
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006217 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 if (Py_UNICODE_MATCH(self, i, substring)) {
6219 if (maxcount-- <= 0)
6220 break;
6221 SPLIT_APPEND(self->str, j, i);
6222 i = j = i + sublen;
6223 } else
6224 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
6226 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 }
6229 return list;
6230
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_DECREF(list);
6233 return NULL;
6234}
6235
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006236static
6237PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 PyObject *list,
6239 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 register Py_ssize_t i;
6242 register Py_ssize_t j;
6243 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006244 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006245 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006246
6247 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006249 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006251 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6253 i--;
6254 if (j > i) {
6255 if (maxcount-- <= 0)
6256 break;
6257 SPLIT_APPEND(buf, i + 1, j + 1);
6258 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6259 i--;
6260 j = i;
6261 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006262 }
6263 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006265 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006266 if (PyList_Reverse(list) < 0)
6267 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006268 return list;
6269
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006271 Py_DECREF(list);
6272 return NULL;
6273}
6274
Benjamin Peterson14339b62009-01-31 16:36:08 +00006275static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006276PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 PyObject *list,
6278 Py_UNICODE ch,
6279 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 register Py_ssize_t i;
6282 register Py_ssize_t j;
6283 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006284 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006285 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006286
6287 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 if (buf[i] == ch) {
6289 if (maxcount-- <= 0)
6290 break;
6291 SPLIT_APPEND(buf, i + 1, j + 1);
6292 j = i = i - 1;
6293 } else
6294 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006295 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006296 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006298 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299 if (PyList_Reverse(list) < 0)
6300 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006301 return list;
6302
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006304 Py_DECREF(list);
6305 return NULL;
6306}
6307
Benjamin Peterson14339b62009-01-31 16:36:08 +00006308static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006309PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyObject *list,
6311 PyUnicodeObject *substring,
6312 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006314 register Py_ssize_t i;
6315 register Py_ssize_t j;
6316 Py_ssize_t len = self->length;
6317 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006318 PyObject *str;
6319
6320 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 if (Py_UNICODE_MATCH(self, i, substring)) {
6322 if (maxcount-- <= 0)
6323 break;
6324 SPLIT_APPEND(self->str, i + sublen, j);
6325 j = i;
6326 i -= sublen;
6327 } else
6328 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006329 }
6330 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006332 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006333 if (PyList_Reverse(list) < 0)
6334 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006335 return list;
6336
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006338 Py_DECREF(list);
6339 return NULL;
6340}
6341
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342#undef SPLIT_APPEND
6343
6344static
6345PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 PyUnicodeObject *substring,
6347 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348{
6349 PyObject *list;
6350
6351 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006352 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354 list = PyList_New(0);
6355 if (!list)
6356 return NULL;
6357
6358 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
6361 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 Py_DECREF(list);
6366 PyErr_SetString(PyExc_ValueError, "empty separator");
6367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
6369 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371}
6372
Tim Petersced69f82003-09-16 20:30:58 +00006373static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006374PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 PyUnicodeObject *substring,
6376 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006377{
6378 PyObject *list;
6379
6380 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006381 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006382
6383 list = PyList_New(0);
6384 if (!list)
6385 return NULL;
6386
6387 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006389
6390 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006392
6393 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 Py_DECREF(list);
6395 PyErr_SetString(PyExc_ValueError, "empty separator");
6396 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006397 }
6398 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006400}
6401
6402static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 PyUnicodeObject *str1,
6405 PyUnicodeObject *str2,
6406 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
6408 PyUnicodeObject *u;
6409
6410 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
Thomas Wouters477c8d52006-05-27 19:21:47 +00006413 if (str1->length == str2->length) {
6414 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006415 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006416 if (str1->length == 1) {
6417 /* replace characters */
6418 Py_UNICODE u1, u2;
6419 if (!findchar(self->str, self->length, str1->str[0]))
6420 goto nothing;
6421 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6422 if (!u)
6423 return NULL;
6424 Py_UNICODE_COPY(u->str, self->str, self->length);
6425 u1 = str1->str[0];
6426 u2 = str2->str[0];
6427 for (i = 0; i < u->length; i++)
6428 if (u->str[i] == u1) {
6429 if (--maxcount < 0)
6430 break;
6431 u->str[i] = u2;
6432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434 i = fastsearch(
6435 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006437 if (i < 0)
6438 goto nothing;
6439 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6440 if (!u)
6441 return NULL;
6442 Py_UNICODE_COPY(u->str, self->str, self->length);
6443 while (i <= self->length - str1->length)
6444 if (Py_UNICODE_MATCH(self, i, str1)) {
6445 if (--maxcount < 0)
6446 break;
6447 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6448 i += str1->length;
6449 } else
6450 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006453
6454 Py_ssize_t n, i, j, e;
6455 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 Py_UNICODE *p;
6457
6458 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006459 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 if (n > maxcount)
6461 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006462 if (n == 0)
6463 goto nothing;
6464 /* new_size = self->length + n * (str2->length - str1->length)); */
6465 delta = (str2->length - str1->length);
6466 if (delta == 0) {
6467 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 product = n * (str2->length - str1->length);
6470 if ((product / (str2->length - str1->length)) != n) {
6471 PyErr_SetString(PyExc_OverflowError,
6472 "replace string is too long");
6473 return NULL;
6474 }
6475 new_size = self->length + product;
6476 if (new_size < 0) {
6477 PyErr_SetString(PyExc_OverflowError,
6478 "replace string is too long");
6479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
6481 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482 u = _PyUnicode_New(new_size);
6483 if (!u)
6484 return NULL;
6485 i = 0;
6486 p = u->str;
6487 e = self->length - str1->length;
6488 if (str1->length > 0) {
6489 while (n-- > 0) {
6490 /* look for next match */
6491 j = i;
6492 while (j <= e) {
6493 if (Py_UNICODE_MATCH(self, j, str1))
6494 break;
6495 j++;
6496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006498 if (j > e)
6499 break;
6500 /* copy unchanged part [i:j] */
6501 Py_UNICODE_COPY(p, self->str+i, j-i);
6502 p += j - i;
6503 }
6504 /* copy substitution string */
6505 if (str2->length > 0) {
6506 Py_UNICODE_COPY(p, str2->str, str2->length);
6507 p += str2->length;
6508 }
6509 i = j + str1->length;
6510 }
6511 if (i < self->length)
6512 /* copy tail [i:] */
6513 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6514 } else {
6515 /* interleave */
6516 while (n > 0) {
6517 Py_UNICODE_COPY(p, str2->str, str2->length);
6518 p += str2->length;
6519 if (--n <= 0)
6520 break;
6521 *p++ = self->str[i++];
6522 }
6523 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 /* nothing to replace; return original string (when possible) */
6530 if (PyUnicode_CheckExact(self)) {
6531 Py_INCREF(self);
6532 return (PyObject *) self;
6533 }
6534 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
6537/* --- Unicode Object Methods --------------------------------------------- */
6538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006539PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541\n\
6542Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006543characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
6545static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006546unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return fixup(self, fixtitle);
6549}
6550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553\n\
6554Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006555have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
6557static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006558unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 return fixup(self, fixcapitalize);
6561}
6562
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   run every word through fixup(..., fixcapitalize) and join the words
   with the default separator of PyUnicode_Join().
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6600
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006601/* Argument converter. Coerces to a single unicode character */
6602
6603static int
6604convert_uc(PyObject *obj, void *addr)
6605{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6607 PyObject *uniobj;
6608 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006609
Benjamin Peterson14339b62009-01-31 16:36:08 +00006610 uniobj = PyUnicode_FromObject(obj);
6611 if (uniobj == NULL) {
6612 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 return 0;
6615 }
6616 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6617 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 Py_DECREF(uniobj);
6620 return 0;
6621 }
6622 unistr = PyUnicode_AS_UNICODE(uniobj);
6623 *fillcharloc = unistr[0];
6624 Py_DECREF(uniobj);
6625 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006626}
6627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006628PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006631Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006632done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
6634static PyObject *
6635unicode_center(PyUnicodeObject *self, PyObject *args)
6636{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637 Py_ssize_t marg, left;
6638 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006639 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
Thomas Woutersde017742006-02-16 19:34:37 +00006641 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 return NULL;
6643
Tim Peters7a29bd52001-09-12 03:03:31 +00006644 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 Py_INCREF(self);
6646 return (PyObject*) self;
6647 }
6648
6649 marg = width - self->length;
6650 left = marg / 2 + (marg & width & 1);
6651
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006652 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653}
6654
Marc-André Lemburge5034372000-08-08 08:04:29 +00006655#if 0
6656
6657/* This code should go into some future Unicode collation support
6658 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006659 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006660
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006661/* speedy UTF-16 code point order comparison */
6662/* gleaned from: */
6663/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6664
/* Per-band adjustment table, indexed by (code unit >> 11), i.e. one entry
   per block of 0x800 code units.  Entry 27 (units 0xD800-0xDFFF) adds
   0x2000; entries 28-31 (units 0xE000-0xFFFF) subtract 0x800; all other
   entries leave the value unchanged.
   NOTE(review): the table has only 32 entries, so the lookup below assumes
   code units never exceed 0xFFFF -- confirm before enabling this code on a
   build with a wider Py_UNICODE. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled variant of unicode_compare(): compares the two strings unit by
   unit after remapping each unit through utf16Fixup.  Returns -1, 0 or 1;
   when one string is a prefix of the other, the shorter one sorts first. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only units above that are looked up in
           the fixup table. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix exhausted: order by remaining length. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6704
Marc-André Lemburge5034372000-08-08 08:04:29 +00006705#else
6706
6707static int
6708unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006710 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006711
6712 Py_UNICODE *s1 = str1->str;
6713 Py_UNICODE *s2 = str2->str;
6714
6715 len1 = str1->length;
6716 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006717
Marc-André Lemburge5034372000-08-08 08:04:29 +00006718 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006719 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006720
Fredrik Lundh45714e92001-06-26 16:39:36 +00006721 c1 = *s1++;
6722 c2 = *s2++;
6723
6724 if (c1 != c2)
6725 return (c1 < c2) ? -1 : 1;
6726
Marc-André Lemburge5034372000-08-08 08:04:29 +00006727 len1--; len2--;
6728 }
6729
6730 return (len1 < len2) ? -1 : (len1 != len2);
6731}
6732
6733#endif
6734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006738 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6739 return unicode_compare((PyUnicodeObject *)left,
6740 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006741 PyErr_Format(PyExc_TypeError,
6742 "Can't compare %.100s and %.100s",
6743 left->ob_type->tp_name,
6744 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 return -1;
6746}
6747
Martin v. Löwis5b222132007-06-10 09:51:05 +00006748int
6749PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6750{
6751 int i;
6752 Py_UNICODE *id;
6753 assert(PyUnicode_Check(uni));
6754 id = PyUnicode_AS_UNICODE(uni);
6755 /* Compare Unicode string and source character set string */
6756 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 if (id[i] != str[i])
6758 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006759 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006761 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006763 return 0;
6764}
6765
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006766
Benjamin Peterson29060642009-01-31 22:14:21 +00006767#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006768 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006769
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006770PyObject *PyUnicode_RichCompare(PyObject *left,
6771 PyObject *right,
6772 int op)
6773{
6774 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006776 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6777 PyObject *v;
6778 if (((PyUnicodeObject *) left)->length !=
6779 ((PyUnicodeObject *) right)->length) {
6780 if (op == Py_EQ) {
6781 Py_INCREF(Py_False);
6782 return Py_False;
6783 }
6784 if (op == Py_NE) {
6785 Py_INCREF(Py_True);
6786 return Py_True;
6787 }
6788 }
6789 if (left == right)
6790 result = 0;
6791 else
6792 result = unicode_compare((PyUnicodeObject *)left,
6793 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006794
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006795 /* Convert the return value to a Boolean */
6796 switch (op) {
6797 case Py_EQ:
6798 v = TEST_COND(result == 0);
6799 break;
6800 case Py_NE:
6801 v = TEST_COND(result != 0);
6802 break;
6803 case Py_LE:
6804 v = TEST_COND(result <= 0);
6805 break;
6806 case Py_GE:
6807 v = TEST_COND(result >= 0);
6808 break;
6809 case Py_LT:
6810 v = TEST_COND(result == -1);
6811 break;
6812 case Py_GT:
6813 v = TEST_COND(result == 1);
6814 break;
6815 default:
6816 PyErr_BadArgument();
6817 return NULL;
6818 }
6819 Py_INCREF(v);
6820 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006822
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006823 Py_INCREF(Py_NotImplemented);
6824 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006825}
6826
Guido van Rossum403d68b2000-03-13 15:55:09 +00006827int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006829{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006830 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006831 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006832
6833 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006834 sub = PyUnicode_FromObject(element);
6835 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 PyErr_Format(PyExc_TypeError,
6837 "'in <string>' requires string as left operand, not %s",
6838 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006839 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006840 }
6841
Thomas Wouters477c8d52006-05-27 19:21:47 +00006842 str = PyUnicode_FromObject(container);
6843 if (!str) {
6844 Py_DECREF(sub);
6845 return -1;
6846 }
6847
6848 result = stringlib_contains_obj(str, sub);
6849
6850 Py_DECREF(str);
6851 Py_DECREF(sub);
6852
Guido van Rossum403d68b2000-03-13 15:55:09 +00006853 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006854}
6855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856/* Concat to string or Unicode object giving a new Unicode object. */
6857
6858PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860{
6861 PyUnicodeObject *u = NULL, *v = NULL, *w;
6862
6863 /* Coerce the two arguments */
6864 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6865 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6868 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
6871 /* Shortcuts */
6872 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 Py_DECREF(v);
6874 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 }
6876 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Py_DECREF(u);
6878 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 }
6880
6881 /* Concat the two Unicode strings */
6882 w = _PyUnicode_New(u->length + v->length);
6883 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 Py_UNICODE_COPY(w->str, u->str, u->length);
6886 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6887
6888 Py_DECREF(u);
6889 Py_DECREF(v);
6890 return (PyObject *)w;
6891
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 Py_XDECREF(u);
6894 Py_XDECREF(v);
6895 return NULL;
6896}
6897
Walter Dörwald1ab83302007-05-18 17:15:44 +00006898void
6899PyUnicode_Append(PyObject **pleft, PyObject *right)
6900{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 PyObject *new;
6902 if (*pleft == NULL)
6903 return;
6904 if (right == NULL || !PyUnicode_Check(*pleft)) {
6905 Py_DECREF(*pleft);
6906 *pleft = NULL;
6907 return;
6908 }
6909 new = PyUnicode_Concat(*pleft, right);
6910 Py_DECREF(*pleft);
6911 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006912}
6913
6914void
6915PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6916{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006917 PyUnicode_Append(pleft, right);
6918 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006919}
6920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006924Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006925string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006926interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928static PyObject *
6929unicode_count(PyUnicodeObject *self, PyObject *args)
6930{
6931 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006933 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 PyObject *result;
6935
Guido van Rossumb8872e62000-05-09 14:14:27 +00006936 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 return NULL;
6939
6940 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006941 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Thomas Wouters477c8d52006-05-27 19:21:47 +00006945 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Christian Heimes217cfd12007-12-02 14:31:20 +00006947 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006948 stringlib_count(self->str + start, end - start,
6949 substring->str, substring->length)
6950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 return result;
6955}
6956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006960Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006961to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006962handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6964'xmlcharrefreplace' as well as any other name registered with\n\
6965codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967static PyObject *
6968unicode_encode(PyUnicodeObject *self, PyObject *args)
6969{
6970 char *encoding = NULL;
6971 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006972 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6975 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006976 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006977 if (v == NULL)
6978 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006979 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006980 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006981 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006982 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006983 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006984 Py_DECREF(v);
6985 return NULL;
6986 }
6987 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006990 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006991}
6992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995\n\
6996Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
6999static PyObject*
7000unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7001{
7002 Py_UNICODE *e;
7003 Py_UNICODE *p;
7004 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007005 Py_UNICODE *qe;
7006 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 PyUnicodeObject *u;
7008 int tabsize = 8;
7009
7010 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
Thomas Wouters7e474022000-07-16 12:04:32 +00007013 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007014 i = 0; /* chars up to and including most recent \n or \r */
7015 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7016 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 for (p = self->str; p < e; p++)
7018 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 if (tabsize > 0) {
7020 incr = tabsize - (j % tabsize); /* cannot overflow */
7021 if (j > PY_SSIZE_T_MAX - incr)
7022 goto overflow1;
7023 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007024 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 if (j > PY_SSIZE_T_MAX - 1)
7028 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 j++;
7030 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 if (i > PY_SSIZE_T_MAX - j)
7032 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007034 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 }
7036 }
7037
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007038 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007040
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 /* Second pass: create output string and fill it */
7042 u = _PyUnicode_New(i + j);
7043 if (!u)
7044 return NULL;
7045
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007046 j = 0; /* same as in first pass */
7047 q = u->str; /* next output char */
7048 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049
7050 for (p = self->str; p < e; p++)
7051 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 if (tabsize > 0) {
7053 i = tabsize - (j % tabsize);
7054 j += i;
7055 while (i--) {
7056 if (q >= qe)
7057 goto overflow2;
7058 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007061 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 else {
7063 if (q >= qe)
7064 goto overflow2;
7065 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 if (*p == '\n' || *p == '\r')
7068 j = 0;
7069 }
7070
7071 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007072
7073 overflow2:
7074 Py_DECREF(u);
7075 overflow1:
7076 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078}
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082\n\
7083Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007084such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085arguments start and end are interpreted as in slice notation.\n\
7086\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088
7089static PyObject *
7090unicode_find(PyUnicodeObject *self, PyObject *args)
7091{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007092 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007093 Py_ssize_t start;
7094 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007095 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096
Christian Heimes9cd17752007-11-18 19:35:23 +00007097 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
Thomas Wouters477c8d52006-05-27 19:21:47 +00007100 result = stringlib_find_slice(
7101 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7102 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7103 start, end
7104 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007107
Christian Heimes217cfd12007-12-02 14:31:20 +00007108 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109}
7110
7111static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007112unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113{
7114 if (index < 0 || index >= self->length) {
7115 PyErr_SetString(PyExc_IndexError, "string index out of range");
7116 return NULL;
7117 }
7118
7119 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7120}
7121
Guido van Rossumc2504932007-09-18 19:42:40 +00007122/* Believe it or not, this produces the same value for ASCII strings
7123 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007125unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126{
Guido van Rossumc2504932007-09-18 19:42:40 +00007127 Py_ssize_t len;
7128 Py_UNICODE *p;
7129 long x;
7130
7131 if (self->hash != -1)
7132 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007133 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007134 p = self->str;
7135 x = *p << 7;
7136 while (--len >= 0)
7137 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007138 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007139 if (x == -1)
7140 x = -2;
7141 self->hash = x;
7142 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143}
7144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
7151unicode_index(PyUnicodeObject *self, PyObject *args)
7152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007154 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007155 Py_ssize_t start;
7156 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Christian Heimes9cd17752007-11-18 19:35:23 +00007158 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 result = stringlib_find_slice(
7162 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7163 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7164 start, end
7165 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
7167 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 if (result < 0) {
7170 PyErr_SetString(PyExc_ValueError, "substring not found");
7171 return NULL;
7172 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007173
Christian Heimes217cfd12007-12-02 14:31:20 +00007174 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175}
7176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007180Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007181at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007184unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185{
7186 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7187 register const Py_UNICODE *e;
7188 int cased;
7189
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 /* Shortcut for single character strings */
7191 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007194 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007195 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007197
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 e = p + PyUnicode_GET_SIZE(self);
7199 cased = 0;
7200 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007202
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7204 return PyBool_FromLong(0);
7205 else if (!cased && Py_UNICODE_ISLOWER(ch))
7206 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007208 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209}
7210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007214Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007215at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
7217static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007218unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219{
7220 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7221 register const Py_UNICODE *e;
7222 int cased;
7223
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 /* Shortcut for single character strings */
7225 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007228 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007229 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 e = p + PyUnicode_GET_SIZE(self);
7233 cased = 0;
7234 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007236
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7238 return PyBool_FromLong(0);
7239 else if (!cased && Py_UNICODE_ISUPPER(ch))
7240 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007242 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007245PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007248Return True if S is a titlecased string and there is at least one\n\
7249character in S, i.e. upper- and titlecase characters may only\n\
7250follow uncased characters and lowercase characters only cased ones.\n\
7251Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007254unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
7256 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7257 register const Py_UNICODE *e;
7258 int cased, previous_is_cased;
7259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 /* Shortcut for single character strings */
7261 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7263 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007265 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007266 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007268
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 e = p + PyUnicode_GET_SIZE(self);
7270 cased = 0;
7271 previous_is_cased = 0;
7272 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007274
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7276 if (previous_is_cased)
7277 return PyBool_FromLong(0);
7278 previous_is_cased = 1;
7279 cased = 1;
7280 }
7281 else if (Py_UNICODE_ISLOWER(ch)) {
7282 if (!previous_is_cased)
7283 return PyBool_FromLong(0);
7284 previous_is_cased = 1;
7285 cased = 1;
7286 }
7287 else
7288 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007290 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291}
7292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007293PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007296Return True if all characters in S are whitespace\n\
7297and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007300unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301{
7302 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7303 register const Py_UNICODE *e;
7304
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 /* Shortcut for single character strings */
7306 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 Py_UNICODE_ISSPACE(*p))
7308 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007310 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007311 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007313
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 e = p + PyUnicode_GET_SIZE(self);
7315 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 if (!Py_UNICODE_ISSPACE(*p))
7317 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007319 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320}
7321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007324\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007325Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007327
7328static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007330{
7331 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7332 register const Py_UNICODE *e;
7333
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007334 /* Shortcut for single character strings */
7335 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 Py_UNICODE_ISALPHA(*p))
7337 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007338
7339 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007340 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007342
7343 e = p + PyUnicode_GET_SIZE(self);
7344 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 if (!Py_UNICODE_ISALPHA(*p))
7346 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007347 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007348 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007349}
7350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007351PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007353\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007354Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007356
7357static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007358unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007359{
7360 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7361 register const Py_UNICODE *e;
7362
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007363 /* Shortcut for single character strings */
7364 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 Py_UNICODE_ISALNUM(*p))
7366 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007367
7368 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007369 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007371
7372 e = p + PyUnicode_GET_SIZE(self);
7373 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 if (!Py_UNICODE_ISALNUM(*p))
7375 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007376 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007377 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007378}
7379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007383Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007384False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
7386static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007387unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
7389 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7390 register const Py_UNICODE *e;
7391
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 /* Shortcut for single character strings */
7393 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 Py_UNICODE_ISDECIMAL(*p))
7395 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007397 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007398 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007400
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 e = p + PyUnicode_GET_SIZE(self);
7402 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 if (!Py_UNICODE_ISDECIMAL(*p))
7404 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007406 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407}
7408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007409PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007412Return True if all characters in S are digits\n\
7413and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414
7415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007416unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417{
7418 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7419 register const Py_UNICODE *e;
7420
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 /* Shortcut for single character strings */
7422 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 Py_UNICODE_ISDIGIT(*p))
7424 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007426 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007427 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007429
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 e = p + PyUnicode_GET_SIZE(self);
7431 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 if (!Py_UNICODE_ISDIGIT(*p))
7433 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007435 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436}
7437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007438PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007441Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007442False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
7444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007445unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446{
7447 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7448 register const Py_UNICODE *e;
7449
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 /* Shortcut for single character strings */
7451 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 Py_UNICODE_ISNUMERIC(*p))
7453 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007455 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007456 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 e = p + PyUnicode_GET_SIZE(self);
7460 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 if (!Py_UNICODE_ISNUMERIC(*p))
7462 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007464 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Martin v. Löwis47383402007-08-15 07:32:56 +00007467int
7468PyUnicode_IsIdentifier(PyObject *self)
7469{
7470 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7471 register const Py_UNICODE *e;
7472
7473 /* Special case for empty strings */
7474 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007476
7477 /* PEP 3131 says that the first character must be in
7478 XID_Start and subsequent characters in XID_Continue,
7479 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007481 letters, digits, underscore). However, given the current
7482 definition of XID_Start and XID_Continue, it is sufficient
7483 to check just for these, except that _ must be allowed
7484 as starting an identifier. */
7485 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7486 return 0;
7487
7488 e = p + PyUnicode_GET_SIZE(self);
7489 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 if (!_PyUnicode_IsXidContinue(*p))
7491 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007492 }
7493 return 1;
7494}
7495
7496PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007498\n\
7499Return True if S is a valid identifier according\n\
7500to the language definition.");
7501
7502static PyObject*
7503unicode_isidentifier(PyObject *self)
7504{
7505 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7506}
7507
Georg Brandl559e5d72008-06-11 18:37:52 +00007508PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007510\n\
7511Return True if all characters in S are considered\n\
7512printable in repr() or S is empty, False otherwise.");
7513
7514static PyObject*
7515unicode_isprintable(PyObject *self)
7516{
7517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7518 register const Py_UNICODE *e;
7519
7520 /* Shortcut for single character strings */
7521 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7522 Py_RETURN_TRUE;
7523 }
7524
7525 e = p + PyUnicode_GET_SIZE(self);
7526 for (; p < e; p++) {
7527 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7528 Py_RETURN_FALSE;
7529 }
7530 }
7531 Py_RETURN_TRUE;
7532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536\n\
7537Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539
7540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007541unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007543 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544}
7545
Martin v. Löwis18e16552006-02-15 17:27:45 +00007546static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547unicode_length(PyUnicodeObject *self)
7548{
7549 return self->length;
7550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007555Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007556done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
7558static PyObject *
7559unicode_ljust(PyUnicodeObject *self, PyObject *args)
7560{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007561 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007562 Py_UNICODE fillchar = ' ';
7563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007564 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 return NULL;
7566
Tim Peters7a29bd52001-09-12 03:03:31 +00007567 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 Py_INCREF(self);
7569 return (PyObject*) self;
7570 }
7571
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007572 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573}
7574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007575PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007578Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
7580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007581unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 return fixup(self, fixlower);
7584}
7585
/* Strip modes; the values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format for each strip mode, indexed by the mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7594
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007595/* externally visible for str.strip(unicode) */
7596PyObject *
7597_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7598{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007599 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7600 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7601 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7602 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7603 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007604
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007606
Benjamin Peterson14339b62009-01-31 16:36:08 +00007607 i = 0;
7608 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7610 i++;
7611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007612 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007613
Benjamin Peterson14339b62009-01-31 16:36:08 +00007614 j = len;
7615 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 do {
7617 j--;
7618 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7619 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007621
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 Py_INCREF(self);
7624 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007625 }
7626 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007628}
7629
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
7631static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007632do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7635 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007636
Benjamin Peterson14339b62009-01-31 16:36:08 +00007637 i = 0;
7638 if (striptype != RIGHTSTRIP) {
7639 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7640 i++;
7641 }
7642 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007643
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 j = len;
7645 if (striptype != LEFTSTRIP) {
7646 do {
7647 j--;
7648 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7649 j++;
7650 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007651
Benjamin Peterson14339b62009-01-31 16:36:08 +00007652 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7653 Py_INCREF(self);
7654 return (PyObject*)self;
7655 }
7656 else
7657 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658}
7659
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007660
7661static PyObject *
7662do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7663{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007664 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7667 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007668
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 if (sep != NULL && sep != Py_None) {
7670 if (PyUnicode_Check(sep))
7671 return _PyUnicode_XStrip(self, striptype, sep);
7672 else {
7673 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 "%s arg must be None or str",
7675 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 return NULL;
7677 }
7678 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007679
Benjamin Peterson14339b62009-01-31 16:36:08 +00007680 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007681}
7682
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007686\n\
7687Return a copy of the string S with leading and trailing\n\
7688whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007689If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007690
7691static PyObject *
7692unicode_strip(PyUnicodeObject *self, PyObject *args)
7693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007694 if (PyTuple_GET_SIZE(args) == 0)
7695 return do_strip(self, BOTHSTRIP); /* Common case */
7696 else
7697 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007698}
7699
7700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007701PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007703\n\
7704Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007705If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007706
7707static PyObject *
7708unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 if (PyTuple_GET_SIZE(args) == 0)
7711 return do_strip(self, LEFTSTRIP); /* Common case */
7712 else
7713 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007714}
7715
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007719\n\
7720Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007721If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007722
7723static PyObject *
7724unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7725{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007726 if (PyTuple_GET_SIZE(args) == 0)
7727 return do_strip(self, RIGHTSTRIP); /* Common case */
7728 else
7729 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007730}
7731
7732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007734unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735{
7736 PyUnicodeObject *u;
7737 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007739 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
Georg Brandl222de0f2009-04-12 12:01:50 +00007741 if (len < 1) {
7742 Py_INCREF(unicode_empty);
7743 return (PyObject *)unicode_empty;
7744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
Tim Peters7a29bd52001-09-12 03:03:31 +00007746 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 /* no repeat, return original string */
7748 Py_INCREF(str);
7749 return (PyObject*) str;
7750 }
Tim Peters8f422462000-09-09 06:13:41 +00007751
7752 /* ensure # of chars needed doesn't overflow int and # of bytes
7753 * needed doesn't overflow size_t
7754 */
7755 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007756 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007757 PyErr_SetString(PyExc_OverflowError,
7758 "repeated string is too long");
7759 return NULL;
7760 }
7761 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7762 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7763 PyErr_SetString(PyExc_OverflowError,
7764 "repeated string is too long");
7765 return NULL;
7766 }
7767 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 if (!u)
7769 return NULL;
7770
7771 p = u->str;
7772
Georg Brandl222de0f2009-04-12 12:01:50 +00007773 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007774 Py_UNICODE_FILL(p, str->str[0], len);
7775 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007776 Py_ssize_t done = str->length; /* number of characters copied this far */
7777 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007779 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007780 Py_UNICODE_COPY(p+done, p, n);
7781 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 }
7784
7785 return (PyObject*) u;
7786}
7787
7788PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 PyObject *subobj,
7790 PyObject *replobj,
7791 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792{
7793 PyObject *self;
7794 PyObject *str1;
7795 PyObject *str2;
7796 PyObject *result;
7797
7798 self = PyUnicode_FromObject(obj);
7799 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 str1 = PyUnicode_FromObject(subobj);
7802 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_DECREF(self);
7804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 }
7806 str2 = PyUnicode_FromObject(replobj);
7807 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 Py_DECREF(self);
7809 Py_DECREF(str1);
7810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 }
Tim Petersced69f82003-09-16 20:30:58 +00007812 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 (PyUnicodeObject *)str1,
7814 (PyUnicodeObject *)str2,
7815 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 Py_DECREF(self);
7817 Py_DECREF(str1);
7818 Py_DECREF(str2);
7819 return result;
7820}
7821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824\n\
7825Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007826old replaced by new. If the optional argument count is\n\
7827given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
7829static PyObject*
7830unicode_replace(PyUnicodeObject *self, PyObject *args)
7831{
7832 PyUnicodeObject *str1;
7833 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 PyObject *result;
7836
Martin v. Löwis18e16552006-02-15 17:27:45 +00007837 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 return NULL;
7839 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7840 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007843 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 Py_DECREF(str1);
7845 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847
7848 result = replace(self, str1, str2, maxcount);
7849
7850 Py_DECREF(str1);
7851 Py_DECREF(str2);
7852 return result;
7853}
7854
7855static
7856PyObject *unicode_repr(PyObject *unicode)
7857{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007858 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007859 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007860 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7861 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7862
7863 /* XXX(nnorwitz): rather than over-allocating, it would be
7864 better to choose a different scheme. Perhaps scan the
7865 first N-chars of the string and allocate based on that size.
7866 */
7867 /* Initial allocation is based on the longest-possible unichr
7868 escape.
7869
7870 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7871 unichr, so in this case it's the longest unichr escape. In
7872 narrow (UTF-16) builds this is five chars per source unichr
7873 since there are two unichrs in the surrogate pair, so in narrow
7874 (UTF-16) builds it's not the longest unichr escape.
7875
7876 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7877 so in the narrow (UTF-16) build case it's the longest unichr
7878 escape.
7879 */
7880
Walter Dörwald1ab83302007-05-18 17:15:44 +00007881 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007883#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007885#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007887#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007889 if (repr == NULL)
7890 return NULL;
7891
Walter Dörwald1ab83302007-05-18 17:15:44 +00007892 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007893
7894 /* Add quote */
7895 *p++ = (findchar(s, size, '\'') &&
7896 !findchar(s, size, '"')) ? '"' : '\'';
7897 while (size-- > 0) {
7898 Py_UNICODE ch = *s++;
7899
7900 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007901 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007902 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007903 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007904 continue;
7905 }
7906
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007908 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007909 *p++ = '\\';
7910 *p++ = 't';
7911 }
7912 else if (ch == '\n') {
7913 *p++ = '\\';
7914 *p++ = 'n';
7915 }
7916 else if (ch == '\r') {
7917 *p++ = '\\';
7918 *p++ = 'r';
7919 }
7920
7921 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007922 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007923 *p++ = '\\';
7924 *p++ = 'x';
7925 *p++ = hexdigits[(ch >> 4) & 0x000F];
7926 *p++ = hexdigits[ch & 0x000F];
7927 }
7928
Georg Brandl559e5d72008-06-11 18:37:52 +00007929 /* Copy ASCII characters as-is */
7930 else if (ch < 0x7F) {
7931 *p++ = ch;
7932 }
7933
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007935 else {
7936 Py_UCS4 ucs = ch;
7937
7938#ifndef Py_UNICODE_WIDE
7939 Py_UNICODE ch2 = 0;
7940 /* Get code point from surrogate pair */
7941 if (size > 0) {
7942 ch2 = *s;
7943 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007948 size--;
7949 }
7950 }
7951#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007952 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007953 (categories Z* and C* except ASCII space)
7954 */
7955 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7956 /* Map 8-bit characters to '\xhh' */
7957 if (ucs <= 0xff) {
7958 *p++ = '\\';
7959 *p++ = 'x';
7960 *p++ = hexdigits[(ch >> 4) & 0x000F];
7961 *p++ = hexdigits[ch & 0x000F];
7962 }
7963 /* Map 21-bit characters to '\U00xxxxxx' */
7964 else if (ucs >= 0x10000) {
7965 *p++ = '\\';
7966 *p++ = 'U';
7967 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7968 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7969 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7970 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7971 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7972 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7973 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7974 *p++ = hexdigits[ucs & 0x0000000F];
7975 }
7976 /* Map 16-bit characters to '\uxxxx' */
7977 else {
7978 *p++ = '\\';
7979 *p++ = 'u';
7980 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7981 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7982 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7983 *p++ = hexdigits[ucs & 0x000F];
7984 }
7985 }
7986 /* Copy characters as-is */
7987 else {
7988 *p++ = ch;
7989#ifndef Py_UNICODE_WIDE
7990 if (ucs >= 0x10000)
7991 *p++ = ch2;
7992#endif
7993 }
7994 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007995 }
7996 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007997 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007998
7999 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008000 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008001 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008004PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006\n\
8007Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008008such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009arguments start and end are interpreted as in slice notation.\n\
8010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008011Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013static PyObject *
8014unicode_rfind(PyUnicodeObject *self, PyObject *args)
8015{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008016 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008017 Py_ssize_t start;
8018 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008019 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Christian Heimes9cd17752007-11-18 19:35:23 +00008021 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Thomas Wouters477c8d52006-05-27 19:21:47 +00008024 result = stringlib_rfind_slice(
8025 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8026 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8027 start, end
8028 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029
8030 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008031
Christian Heimes217cfd12007-12-02 14:31:20 +00008032 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033}
8034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008035PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008038Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039
8040static PyObject *
8041unicode_rindex(PyUnicodeObject *self, PyObject *args)
8042{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008044 Py_ssize_t start;
8045 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008046 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
Christian Heimes9cd17752007-11-18 19:35:23 +00008048 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
Thomas Wouters477c8d52006-05-27 19:21:47 +00008051 result = stringlib_rfind_slice(
8052 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8053 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8054 start, end
8055 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
8057 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008058
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 if (result < 0) {
8060 PyErr_SetString(PyExc_ValueError, "substring not found");
8061 return NULL;
8062 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008063 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064}
8065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008066PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008069Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008070done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072static PyObject *
8073unicode_rjust(PyUnicodeObject *self, PyObject *args)
8074{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008075 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008076 Py_UNICODE fillchar = ' ';
8077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008078 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 return NULL;
8080
Tim Peters7a29bd52001-09-12 03:03:31 +00008081 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 Py_INCREF(self);
8083 return (PyObject*) self;
8084 }
8085
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008086 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087}
8088
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 PyObject *sep,
8091 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
8093 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008094
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 s = PyUnicode_FromObject(s);
8096 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 if (sep != NULL) {
8099 sep = PyUnicode_FromObject(sep);
8100 if (sep == NULL) {
8101 Py_DECREF(s);
8102 return NULL;
8103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
8105
8106 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8107
8108 Py_DECREF(s);
8109 Py_XDECREF(sep);
8110 return result;
8111}
8112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008113PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115\n\
8116Return a list of the words in S, using sep as the\n\
8117delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008118splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008119whitespace string is a separator and empty strings are\n\
8120removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121
8122static PyObject*
8123unicode_split(PyUnicodeObject *self, PyObject *args)
8124{
8125 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008126 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return NULL;
8130
8131 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137}
8138
Thomas Wouters477c8d52006-05-27 19:21:47 +00008139PyObject *
8140PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8141{
8142 PyObject* str_obj;
8143 PyObject* sep_obj;
8144 PyObject* out;
8145
8146 str_obj = PyUnicode_FromObject(str_in);
8147 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008149 sep_obj = PyUnicode_FromObject(sep_in);
8150 if (!sep_obj) {
8151 Py_DECREF(str_obj);
8152 return NULL;
8153 }
8154
8155 out = stringlib_partition(
8156 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8157 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8158 );
8159
8160 Py_DECREF(sep_obj);
8161 Py_DECREF(str_obj);
8162
8163 return out;
8164}
8165
8166
8167PyObject *
8168PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8169{
8170 PyObject* str_obj;
8171 PyObject* sep_obj;
8172 PyObject* out;
8173
8174 str_obj = PyUnicode_FromObject(str_in);
8175 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008177 sep_obj = PyUnicode_FromObject(sep_in);
8178 if (!sep_obj) {
8179 Py_DECREF(str_obj);
8180 return NULL;
8181 }
8182
8183 out = stringlib_rpartition(
8184 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8185 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8186 );
8187
8188 Py_DECREF(sep_obj);
8189 Py_DECREF(str_obj);
8190
8191 return out;
8192}
8193
8194PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008196\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008197Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008199found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200
8201static PyObject*
8202unicode_partition(PyUnicodeObject *self, PyObject *separator)
8203{
8204 return PyUnicode_Partition((PyObject *)self, separator);
8205}
8206
8207PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008210Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008211the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008212separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008213
8214static PyObject*
8215unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8216{
8217 return PyUnicode_RPartition((PyObject *)self, separator);
8218}
8219
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008220PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 PyObject *sep,
8222 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008223{
8224 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008225
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008226 s = PyUnicode_FromObject(s);
8227 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 if (sep != NULL) {
8230 sep = PyUnicode_FromObject(sep);
8231 if (sep == NULL) {
8232 Py_DECREF(s);
8233 return NULL;
8234 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008235 }
8236
8237 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8238
8239 Py_DECREF(s);
8240 Py_XDECREF(sep);
8241 return result;
8242}
8243
8244PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008246\n\
8247Return a list of the words in S, using sep as the\n\
8248delimiter string, starting at the end of the string and\n\
8249working to the front. If maxsplit is given, at most maxsplit\n\
8250splits are done. If sep is not specified, any whitespace string\n\
8251is a separator.");
8252
8253static PyObject*
8254unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8255{
8256 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008257 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008258
Martin v. Löwis18e16552006-02-15 17:27:45 +00008259 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008260 return NULL;
8261
8262 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008264 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008266 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008268}
8269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008270PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272\n\
8273Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008274Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008275is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
8277static PyObject*
8278unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8279{
Guido van Rossum86662912000-04-11 15:38:46 +00008280 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Guido van Rossum86662912000-04-11 15:38:46 +00008282 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 return NULL;
8284
Guido van Rossum86662912000-04-11 15:38:46 +00008285 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286}
8287
8288static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008289PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Walter Dörwald346737f2007-05-31 10:44:43 +00008291 if (PyUnicode_CheckExact(self)) {
8292 Py_INCREF(self);
8293 return self;
8294 } else
8295 /* Subtype -- return genuine unicode string with the same value. */
8296 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8297 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298}
8299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008300PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302\n\
8303Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008304and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
8306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008307unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 return fixup(self, fixswapcase);
8310}
8311
Georg Brandlceee0772007-11-27 23:48:05 +00008312PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008314\n\
8315Return a translation table usable for str.translate().\n\
8316If there is only one argument, it must be a dictionary mapping Unicode\n\
8317ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008318Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008319If there are two arguments, they must be strings of equal length, and\n\
8320in the resulting dictionary, each character in x will be mapped to the\n\
8321character at the same position in y. If there is a third argument, it\n\
8322must be a string, whose characters will be mapped to None in the result.");
8323
8324static PyObject*
8325unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8326{
8327 PyObject *x, *y = NULL, *z = NULL;
8328 PyObject *new = NULL, *key, *value;
8329 Py_ssize_t i = 0;
8330 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008331
Georg Brandlceee0772007-11-27 23:48:05 +00008332 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8333 return NULL;
8334 new = PyDict_New();
8335 if (!new)
8336 return NULL;
8337 if (y != NULL) {
8338 /* x must be a string too, of equal length */
8339 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8340 if (!PyUnicode_Check(x)) {
8341 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8342 "be a string if there is a second argument");
8343 goto err;
8344 }
8345 if (PyUnicode_GET_SIZE(x) != ylen) {
8346 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8347 "arguments must have equal length");
8348 goto err;
8349 }
8350 /* create entries for translating chars in x to those in y */
8351 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008352 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8353 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008354 if (!key || !value)
8355 goto err;
8356 res = PyDict_SetItem(new, key, value);
8357 Py_DECREF(key);
8358 Py_DECREF(value);
8359 if (res < 0)
8360 goto err;
8361 }
8362 /* create entries for deleting chars in z */
8363 if (z != NULL) {
8364 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008366 if (!key)
8367 goto err;
8368 res = PyDict_SetItem(new, key, Py_None);
8369 Py_DECREF(key);
8370 if (res < 0)
8371 goto err;
8372 }
8373 }
8374 } else {
8375 /* x must be a dict */
8376 if (!PyDict_Check(x)) {
8377 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8378 "to maketrans it must be a dict");
8379 goto err;
8380 }
8381 /* copy entries into the new dict, converting string keys to int keys */
8382 while (PyDict_Next(x, &i, &key, &value)) {
8383 if (PyUnicode_Check(key)) {
8384 /* convert string keys to integer keys */
8385 PyObject *newkey;
8386 if (PyUnicode_GET_SIZE(key) != 1) {
8387 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8388 "table must be of length 1");
8389 goto err;
8390 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008391 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008392 if (!newkey)
8393 goto err;
8394 res = PyDict_SetItem(new, newkey, value);
8395 Py_DECREF(newkey);
8396 if (res < 0)
8397 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008398 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008399 /* just keep integer keys */
8400 if (PyDict_SetItem(new, key, value) < 0)
8401 goto err;
8402 } else {
8403 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8404 "be strings or integers");
8405 goto err;
8406 }
8407 }
8408 }
8409 return new;
8410 err:
8411 Py_DECREF(new);
8412 return NULL;
8413}
8414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008415PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417\n\
8418Return a copy of the string S, where all characters have been mapped\n\
8419through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008420Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008421Unmapped characters are left untouched. Characters mapped to None\n\
8422are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423
8424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008425unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426{
Georg Brandlceee0772007-11-27 23:48:05 +00008427 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428}
8429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008430PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008433Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434
8435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008436unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 return fixup(self, fixupper);
8439}
8440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008441PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008444Pad a numeric string S with zeros on the left, to fill a field\n\
8445of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
8447static PyObject *
8448unicode_zfill(PyUnicodeObject *self, PyObject *args)
8449{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 PyUnicodeObject *u;
8452
Martin v. Löwis18e16552006-02-15 17:27:45 +00008453 Py_ssize_t width;
8454 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 return NULL;
8456
8457 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008458 if (PyUnicode_CheckExact(self)) {
8459 Py_INCREF(self);
8460 return (PyObject*) self;
8461 }
8462 else
8463 return PyUnicode_FromUnicode(
8464 PyUnicode_AS_UNICODE(self),
8465 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
8468
8469 fill = width - self->length;
8470
8471 u = pad(self, fill, 0, '0');
8472
Walter Dörwald068325e2002-04-15 13:36:47 +00008473 if (u == NULL)
8474 return NULL;
8475
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 if (u->str[fill] == '+' || u->str[fill] == '-') {
8477 /* move sign to beginning of string */
8478 u->str[0] = u->str[fill];
8479 u->str[fill] = '0';
8480 }
8481
8482 return (PyObject*) u;
8483}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
#if 0
/* Disabled helper: report the value of the file-level numfree counter
   as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008493PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008496Return True if S starts with the specified prefix, False otherwise.\n\
8497With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008498With optional end, stop comparing S at that position.\n\
8499prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
8501static PyObject *
8502unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008505 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008508 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008509 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008511 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8513 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008514 if (PyTuple_Check(subobj)) {
8515 Py_ssize_t i;
8516 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8517 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008519 if (substring == NULL)
8520 return NULL;
8521 result = tailmatch(self, substring, start, end, -1);
8522 Py_DECREF(substring);
8523 if (result) {
8524 Py_RETURN_TRUE;
8525 }
8526 }
8527 /* nothing matched */
8528 Py_RETURN_FALSE;
8529 }
8530 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008533 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008535 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536}
8537
8538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008539PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008542Return True if S ends with the specified suffix, False otherwise.\n\
8543With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008544With optional end, stop comparing S at that position.\n\
8545suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
8547static PyObject *
8548unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008553 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008554 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008555 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008557 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8559 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008560 if (PyTuple_Check(subobj)) {
8561 Py_ssize_t i;
8562 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8563 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008565 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567 result = tailmatch(self, substring, start, end, +1);
8568 Py_DECREF(substring);
8569 if (result) {
8570 Py_RETURN_TRUE;
8571 }
8572 }
8573 Py_RETURN_FALSE;
8574 }
8575 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008579 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008581 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582}
8583
Eric Smith8c663262007-08-25 02:26:07 +00008584#include "stringlib/string_format.h"
8585
8586PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008588\n\
8589");
8590
Eric Smith4a7d76d2008-05-30 18:10:19 +00008591static PyObject *
8592unicode__format__(PyObject* self, PyObject* args)
8593{
8594 PyObject *format_spec;
8595
8596 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8597 return NULL;
8598
8599 return _PyUnicode_FormatAdvanced(self,
8600 PyUnicode_AS_UNICODE(format_spec),
8601 PyUnicode_GET_SIZE(format_spec));
8602}
8603
Eric Smith8c663262007-08-25 02:26:07 +00008604PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008606\n\
8607");
8608
8609static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008610unicode__sizeof__(PyUnicodeObject *v)
8611{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008612 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8613 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008614}
8615
8616PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008618
8619static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008620unicode_getnewargs(PyUnicodeObject *v)
8621{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008623}
8624
8625
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626static PyMethodDef unicode_methods[] = {
8627
8628 /* Order is according to common usage: often used methods should
8629 appear first, since lookup is done sequentially. */
8630
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008631 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8632 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8633 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008634 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008635 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8636 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8637 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8638 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8639 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8640 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8641 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008642 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008643 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8644 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8645 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008646 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008647 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8648 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8649 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008650 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008651 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008652 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008653 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008654 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8655 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8656 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8657 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8658 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8659 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8660 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8661 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8662 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8663 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8664 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8665 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8666 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8667 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008668 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008669 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008670 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008671 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008672 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008673 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8674 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008675 {"maketrans", (PyCFunction) unicode_maketrans,
8676 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008677 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008678#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008679 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680#endif
8681
8682#if 0
8683 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008684 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685#endif
8686
Benjamin Peterson14339b62009-01-31 16:36:08 +00008687 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 {NULL, NULL}
8689};
8690
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008691static PyObject *
8692unicode_mod(PyObject *v, PyObject *w)
8693{
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 if (!PyUnicode_Check(v)) {
8695 Py_INCREF(Py_NotImplemented);
8696 return Py_NotImplemented;
8697 }
8698 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008699}
8700
8701static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 0, /*nb_add*/
8703 0, /*nb_subtract*/
8704 0, /*nb_multiply*/
8705 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008706};
8707
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709 (lenfunc) unicode_length, /* sq_length */
8710 PyUnicode_Concat, /* sq_concat */
8711 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8712 (ssizeargfunc) unicode_getitem, /* sq_item */
8713 0, /* sq_slice */
8714 0, /* sq_ass_item */
8715 0, /* sq_ass_slice */
8716 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717};
8718
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008719static PyObject*
8720unicode_subscript(PyUnicodeObject* self, PyObject* item)
8721{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008722 if (PyIndex_Check(item)) {
8723 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008724 if (i == -1 && PyErr_Occurred())
8725 return NULL;
8726 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008727 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008728 return unicode_getitem(self, i);
8729 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008731 Py_UNICODE* source_buf;
8732 Py_UNICODE* result_buf;
8733 PyObject* result;
8734
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008735 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008737 return NULL;
8738 }
8739
8740 if (slicelength <= 0) {
8741 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008742 } else if (start == 0 && step == 1 && slicelength == self->length &&
8743 PyUnicode_CheckExact(self)) {
8744 Py_INCREF(self);
8745 return (PyObject *)self;
8746 } else if (step == 1) {
8747 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008748 } else {
8749 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008750 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8751 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 if (result_buf == NULL)
8754 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008755
8756 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8757 result_buf[i] = source_buf[cur];
8758 }
Tim Petersced69f82003-09-16 20:30:58 +00008759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008760 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008761 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008762 return result;
8763 }
8764 } else {
8765 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8766 return NULL;
8767 }
8768}
8769
8770static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771 (lenfunc)unicode_length, /* mp_length */
8772 (binaryfunc)unicode_subscript, /* mp_subscript */
8773 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008774};
8775
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777/* Helpers for PyUnicode_Format() */
8778
8779static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008780getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008782 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 (*p_argidx)++;
8785 if (arglen < 0)
8786 return args;
8787 else
8788 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
8790 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 return NULL;
8793}
8794
Eric Smith0923d1d2009-04-16 20:16:10 +00008795static void
8796strtounicode(Py_UNICODE *buffer, const char *charbuffer, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008798 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 for (i = len - 1; i >= 0; i--)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801}
8802
Neal Norwitzfc76d632006-01-10 06:03:13 +00008803static int
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804formatfloat(Py_UNICODE *buf,
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 size_t buflen,
8806 int flags,
8807 int prec,
8808 int type,
8809 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810{
Eric Smith0923d1d2009-04-16 20:16:10 +00008811 /* eric.smith: To minimize disturbances in PyUnicode_Format (the
8812 only caller of this routine), I'm going to keep the existing
8813 API to this function. That means that we'll allocate memory and
8814 then copy back into the supplied buffer. But that's better than
8815 all of the changes that would be required in PyUnicode_Format
8816 because it does lots of memory management tricks. */
8817
8818 char* p = NULL;
8819 int result = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 double x;
Eric Smith0923d1d2009-04-16 20:16:10 +00008821 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +00008822
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 x = PyFloat_AsDouble(v);
8824 if (x == -1.0 && PyErr_Occurred())
Eric Smith0923d1d2009-04-16 20:16:10 +00008825 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008828
Mark Dickinson4feda2a2009-03-29 16:34:21 +00008829 /* make sure that the decimal representation of precision really does
8830 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8831 if (prec > 0x7fffffffL) {
8832 PyErr_SetString(PyExc_OverflowError,
8833 "outrageously large precision "
8834 "for formatted float");
Eric Smith0923d1d2009-04-16 20:16:10 +00008835 goto done;
Mark Dickinson4feda2a2009-03-29 16:34:21 +00008836 }
8837
Mark Dickinsonc8a608c2009-03-29 15:19:47 +00008838 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008840
Benjamin Peterson14339b62009-01-31 16:36:08 +00008841 if (((type == 'g' || type == 'G') &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith0923d1d2009-04-16 20:16:10 +00008843 ((type == 'f' || type == 'F') &&
8844 buflen <= (size_t)53 + (size_t)prec)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 PyErr_SetString(PyExc_OverflowError,
8846 "formatted float is too long (precision too large?)");
Eric Smith0923d1d2009-04-16 20:16:10 +00008847 goto done;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008848 }
Eric Smith0923d1d2009-04-16 20:16:10 +00008849
8850 p = PyOS_double_to_string(x, type, prec,
8851 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8852 len = strlen(p);
8853 if (len+1 >= buflen) {
8854 /* Caller supplied buffer is not large enough. */
8855 PyErr_NoMemory();
8856 goto done;
8857 }
8858 strtounicode(buf, p, len);
8859 result = Py_SAFE_DOWNCAST(len, Py_ssize_t, int);
8860
8861done:
8862 PyMem_Free(p);
8863 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864}
8865
Tim Peters38fd5b62000-09-21 05:43:11 +00008866static PyObject*
8867formatlong(PyObject *val, int flags, int prec, int type)
8868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008869 char *buf;
8870 int len;
8871 PyObject *str; /* temporary string object. */
8872 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008873
Benjamin Peterson14339b62009-01-31 16:36:08 +00008874 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8875 if (!str)
8876 return NULL;
8877 result = PyUnicode_FromStringAndSize(buf, len);
8878 Py_DECREF(str);
8879 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008880}
8881
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882static int
8883formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008884 size_t buflen,
8885 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008887 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008888 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 if (PyUnicode_GET_SIZE(v) == 1) {
8890 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8891 buf[1] = '\0';
8892 return 1;
8893 }
8894#ifndef Py_UNICODE_WIDE
8895 if (PyUnicode_GET_SIZE(v) == 2) {
8896 /* Decode a valid surrogate pair */
8897 int c0 = PyUnicode_AS_UNICODE(v)[0];
8898 int c1 = PyUnicode_AS_UNICODE(v)[1];
8899 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8900 0xDC00 <= c1 && c1 <= 0xDFFF) {
8901 buf[0] = c0;
8902 buf[1] = c1;
8903 buf[2] = '\0';
8904 return 2;
8905 }
8906 }
8907#endif
8908 goto onError;
8909 }
8910 else {
8911 /* Integer input truncated to a character */
8912 long x;
8913 x = PyLong_AsLong(v);
8914 if (x == -1 && PyErr_Occurred())
8915 goto onError;
8916
8917 if (x < 0 || x > 0x10ffff) {
8918 PyErr_SetString(PyExc_OverflowError,
8919 "%c arg not in range(0x110000)");
8920 return -1;
8921 }
8922
8923#ifndef Py_UNICODE_WIDE
8924 if (x > 0xffff) {
8925 x -= 0x10000;
8926 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8927 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8928 return 2;
8929 }
8930#endif
8931 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 buf[1] = '\0';
8933 return 1;
8934 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008935
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008937 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008939 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940}
8941
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The replacement list is fully parenthesized so the macro behaves as a
   single operand wherever it is expanded (e.g. after `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
8951
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954{
8955 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008956 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 int args_owned = 0;
8958 PyUnicodeObject *result = NULL;
8959 PyObject *dict = NULL;
8960 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 PyErr_BadInternalCall();
8964 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 }
8966 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008967 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 fmt = PyUnicode_AS_UNICODE(uformat);
8970 fmtcnt = PyUnicode_GET_SIZE(uformat);
8971
8972 reslen = rescnt = fmtcnt + 100;
8973 result = _PyUnicode_New(reslen);
8974 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 res = PyUnicode_AS_UNICODE(result);
8977
8978 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 arglen = PyTuple_Size(args);
8980 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 }
8982 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 arglen = -1;
8984 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008986 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008987 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
8990 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 if (*fmt != '%') {
8992 if (--rescnt < 0) {
8993 rescnt = fmtcnt + 100;
8994 reslen += rescnt;
8995 if (_PyUnicode_Resize(&result, reslen) < 0)
8996 goto onError;
8997 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8998 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009001 }
9002 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 /* Got a format specifier */
9004 int flags = 0;
9005 Py_ssize_t width = -1;
9006 int prec = -1;
9007 Py_UNICODE c = '\0';
9008 Py_UNICODE fill;
9009 int isnumok;
9010 PyObject *v = NULL;
9011 PyObject *temp = NULL;
9012 Py_UNICODE *pbuf;
9013 Py_UNICODE sign;
9014 Py_ssize_t len;
9015 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 fmt++;
9018 if (*fmt == '(') {
9019 Py_UNICODE *keystart;
9020 Py_ssize_t keylen;
9021 PyObject *key;
9022 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009023
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 if (dict == NULL) {
9025 PyErr_SetString(PyExc_TypeError,
9026 "format requires a mapping");
9027 goto onError;
9028 }
9029 ++fmt;
9030 --fmtcnt;
9031 keystart = fmt;
9032 /* Skip over balanced parentheses */
9033 while (pcount > 0 && --fmtcnt >= 0) {
9034 if (*fmt == ')')
9035 --pcount;
9036 else if (*fmt == '(')
9037 ++pcount;
9038 fmt++;
9039 }
9040 keylen = fmt - keystart - 1;
9041 if (fmtcnt < 0 || pcount > 0) {
9042 PyErr_SetString(PyExc_ValueError,
9043 "incomplete format key");
9044 goto onError;
9045 }
9046#if 0
9047 /* keys are converted to strings using UTF-8 and
9048 then looked up since Python uses strings to hold
9049 variables names etc. in its namespaces and we
9050 wouldn't want to break common idioms. */
9051 key = PyUnicode_EncodeUTF8(keystart,
9052 keylen,
9053 NULL);
9054#else
9055 key = PyUnicode_FromUnicode(keystart, keylen);
9056#endif
9057 if (key == NULL)
9058 goto onError;
9059 if (args_owned) {
9060 Py_DECREF(args);
9061 args_owned = 0;
9062 }
9063 args = PyObject_GetItem(dict, key);
9064 Py_DECREF(key);
9065 if (args == NULL) {
9066 goto onError;
9067 }
9068 args_owned = 1;
9069 arglen = -1;
9070 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009071 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 while (--fmtcnt >= 0) {
9073 switch (c = *fmt++) {
9074 case '-': flags |= F_LJUST; continue;
9075 case '+': flags |= F_SIGN; continue;
9076 case ' ': flags |= F_BLANK; continue;
9077 case '#': flags |= F_ALT; continue;
9078 case '0': flags |= F_ZERO; continue;
9079 }
9080 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 if (c == '*') {
9083 v = getnextarg(args, arglen, &argidx);
9084 if (v == NULL)
9085 goto onError;
9086 if (!PyLong_Check(v)) {
9087 PyErr_SetString(PyExc_TypeError,
9088 "* wants int");
9089 goto onError;
9090 }
9091 width = PyLong_AsLong(v);
9092 if (width == -1 && PyErr_Occurred())
9093 goto onError;
9094 if (width < 0) {
9095 flags |= F_LJUST;
9096 width = -width;
9097 }
9098 if (--fmtcnt >= 0)
9099 c = *fmt++;
9100 }
9101 else if (c >= '0' && c <= '9') {
9102 width = c - '0';
9103 while (--fmtcnt >= 0) {
9104 c = *fmt++;
9105 if (c < '0' || c > '9')
9106 break;
9107 if ((width*10) / 10 != width) {
9108 PyErr_SetString(PyExc_ValueError,
9109 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009110 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 }
9112 width = width*10 + (c - '0');
9113 }
9114 }
9115 if (c == '.') {
9116 prec = 0;
9117 if (--fmtcnt >= 0)
9118 c = *fmt++;
9119 if (c == '*') {
9120 v = getnextarg(args, arglen, &argidx);
9121 if (v == NULL)
9122 goto onError;
9123 if (!PyLong_Check(v)) {
9124 PyErr_SetString(PyExc_TypeError,
9125 "* wants int");
9126 goto onError;
9127 }
9128 prec = PyLong_AsLong(v);
9129 if (prec == -1 && PyErr_Occurred())
9130 goto onError;
9131 if (prec < 0)
9132 prec = 0;
9133 if (--fmtcnt >= 0)
9134 c = *fmt++;
9135 }
9136 else if (c >= '0' && c <= '9') {
9137 prec = c - '0';
9138 while (--fmtcnt >= 0) {
9139 c = Py_CHARMASK(*fmt++);
9140 if (c < '0' || c > '9')
9141 break;
9142 if ((prec*10) / 10 != prec) {
9143 PyErr_SetString(PyExc_ValueError,
9144 "prec too big");
9145 goto onError;
9146 }
9147 prec = prec*10 + (c - '0');
9148 }
9149 }
9150 } /* prec */
9151 if (fmtcnt >= 0) {
9152 if (c == 'h' || c == 'l' || c == 'L') {
9153 if (--fmtcnt >= 0)
9154 c = *fmt++;
9155 }
9156 }
9157 if (fmtcnt < 0) {
9158 PyErr_SetString(PyExc_ValueError,
9159 "incomplete format");
9160 goto onError;
9161 }
9162 if (c != '%') {
9163 v = getnextarg(args, arglen, &argidx);
9164 if (v == NULL)
9165 goto onError;
9166 }
9167 sign = 0;
9168 fill = ' ';
9169 switch (c) {
9170
9171 case '%':
9172 pbuf = formatbuf;
9173 /* presume that buffer length is at least 1 */
9174 pbuf[0] = '%';
9175 len = 1;
9176 break;
9177
9178 case 's':
9179 case 'r':
9180 case 'a':
9181 if (PyUnicode_Check(v) && c == 's') {
9182 temp = v;
9183 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 }
9185 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 if (c == 's')
9187 temp = PyObject_Str(v);
9188 else if (c == 'r')
9189 temp = PyObject_Repr(v);
9190 else
9191 temp = PyObject_ASCII(v);
9192 if (temp == NULL)
9193 goto onError;
9194 if (PyUnicode_Check(temp))
9195 /* nothing to do */;
9196 else {
9197 Py_DECREF(temp);
9198 PyErr_SetString(PyExc_TypeError,
9199 "%s argument has non-string str()");
9200 goto onError;
9201 }
9202 }
9203 pbuf = PyUnicode_AS_UNICODE(temp);
9204 len = PyUnicode_GET_SIZE(temp);
9205 if (prec >= 0 && len > prec)
9206 len = prec;
9207 break;
9208
9209 case 'i':
9210 case 'd':
9211 case 'u':
9212 case 'o':
9213 case 'x':
9214 case 'X':
9215 if (c == 'i')
9216 c = 'd';
9217 isnumok = 0;
9218 if (PyNumber_Check(v)) {
9219 PyObject *iobj=NULL;
9220
9221 if (PyLong_Check(v)) {
9222 iobj = v;
9223 Py_INCREF(iobj);
9224 }
9225 else {
9226 iobj = PyNumber_Long(v);
9227 }
9228 if (iobj!=NULL) {
9229 if (PyLong_Check(iobj)) {
9230 isnumok = 1;
9231 temp = formatlong(iobj, flags, prec, c);
9232 Py_DECREF(iobj);
9233 if (!temp)
9234 goto onError;
9235 pbuf = PyUnicode_AS_UNICODE(temp);
9236 len = PyUnicode_GET_SIZE(temp);
9237 sign = 1;
9238 }
9239 else {
9240 Py_DECREF(iobj);
9241 }
9242 }
9243 }
9244 if (!isnumok) {
9245 PyErr_Format(PyExc_TypeError,
9246 "%%%c format: a number is required, "
9247 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9248 goto onError;
9249 }
9250 if (flags & F_ZERO)
9251 fill = '0';
9252 break;
9253
9254 case 'e':
9255 case 'E':
9256 case 'f':
9257 case 'F':
9258 case 'g':
9259 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 pbuf = formatbuf;
9261 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9262 flags, prec, c, v);
9263 if (len < 0)
9264 goto onError;
9265 sign = 1;
9266 if (flags & F_ZERO)
9267 fill = '0';
9268 break;
9269
9270 case 'c':
9271 pbuf = formatbuf;
9272 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9273 if (len < 0)
9274 goto onError;
9275 break;
9276
9277 default:
9278 PyErr_Format(PyExc_ValueError,
9279 "unsupported format character '%c' (0x%x) "
9280 "at index %zd",
9281 (31<=c && c<=126) ? (char)c : '?',
9282 (int)c,
9283 (Py_ssize_t)(fmt - 1 -
9284 PyUnicode_AS_UNICODE(uformat)));
9285 goto onError;
9286 }
9287 if (sign) {
9288 if (*pbuf == '-' || *pbuf == '+') {
9289 sign = *pbuf++;
9290 len--;
9291 }
9292 else if (flags & F_SIGN)
9293 sign = '+';
9294 else if (flags & F_BLANK)
9295 sign = ' ';
9296 else
9297 sign = 0;
9298 }
9299 if (width < len)
9300 width = len;
9301 if (rescnt - (sign != 0) < width) {
9302 reslen -= rescnt;
9303 rescnt = width + fmtcnt + 100;
9304 reslen += rescnt;
9305 if (reslen < 0) {
9306 Py_XDECREF(temp);
9307 PyErr_NoMemory();
9308 goto onError;
9309 }
9310 if (_PyUnicode_Resize(&result, reslen) < 0) {
9311 Py_XDECREF(temp);
9312 goto onError;
9313 }
9314 res = PyUnicode_AS_UNICODE(result)
9315 + reslen - rescnt;
9316 }
9317 if (sign) {
9318 if (fill != ' ')
9319 *res++ = sign;
9320 rescnt--;
9321 if (width > len)
9322 width--;
9323 }
9324 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9325 assert(pbuf[0] == '0');
9326 assert(pbuf[1] == c);
9327 if (fill != ' ') {
9328 *res++ = *pbuf++;
9329 *res++ = *pbuf++;
9330 }
9331 rescnt -= 2;
9332 width -= 2;
9333 if (width < 0)
9334 width = 0;
9335 len -= 2;
9336 }
9337 if (width > len && !(flags & F_LJUST)) {
9338 do {
9339 --rescnt;
9340 *res++ = fill;
9341 } while (--width > len);
9342 }
9343 if (fill == ' ') {
9344 if (sign)
9345 *res++ = sign;
9346 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9347 assert(pbuf[0] == '0');
9348 assert(pbuf[1] == c);
9349 *res++ = *pbuf++;
9350 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009351 }
9352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 Py_UNICODE_COPY(res, pbuf, len);
9354 res += len;
9355 rescnt -= len;
9356 while (--width >= len) {
9357 --rescnt;
9358 *res++ = ' ';
9359 }
9360 if (dict && (argidx < arglen) && c != '%') {
9361 PyErr_SetString(PyExc_TypeError,
9362 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009363 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 goto onError;
9365 }
9366 Py_XDECREF(temp);
9367 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 } /* until end */
9369 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 PyErr_SetString(PyExc_TypeError,
9371 "not all arguments converted during string formatting");
9372 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 }
9374
Thomas Woutersa96affe2006-03-12 00:29:36 +00009375 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 }
9380 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 return (PyObject *)result;
9382
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 Py_XDECREF(result);
9385 Py_DECREF(uformat);
9386 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 }
9389 return NULL;
9390}
9391
Jeremy Hylton938ace62002-07-17 16:30:39 +00009392static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009393unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9394
Tim Peters6d6c1a32001-08-02 04:15:00 +00009395static PyObject *
9396unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9397{
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009399 static char *kwlist[] = {"object", "encoding", "errors", 0};
9400 char *encoding = NULL;
9401 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009402
Benjamin Peterson14339b62009-01-31 16:36:08 +00009403 if (type != &PyUnicode_Type)
9404 return unicode_subtype_new(type, args, kwds);
9405 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 return NULL;
9408 if (x == NULL)
9409 return (PyObject *)_PyUnicode_New(0);
9410 if (encoding == NULL && errors == NULL)
9411 return PyObject_Str(x);
9412 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009414}
9415
Guido van Rossume023fe02001-08-30 03:12:59 +00009416static PyObject *
9417unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9418{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009419 PyUnicodeObject *tmp, *pnew;
9420 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009421
Benjamin Peterson14339b62009-01-31 16:36:08 +00009422 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9423 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9424 if (tmp == NULL)
9425 return NULL;
9426 assert(PyUnicode_Check(tmp));
9427 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9428 if (pnew == NULL) {
9429 Py_DECREF(tmp);
9430 return NULL;
9431 }
9432 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9433 if (pnew->str == NULL) {
9434 _Py_ForgetReference((PyObject *)pnew);
9435 PyObject_Del(pnew);
9436 Py_DECREF(tmp);
9437 return PyErr_NoMemory();
9438 }
9439 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9440 pnew->length = n;
9441 pnew->hash = tmp->hash;
9442 Py_DECREF(tmp);
9443 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009444}
9445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009446PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009448\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009449Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009450encoding defaults to the current default string encoding.\n\
9451errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009452
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009453static PyObject *unicode_iter(PyObject *seq);
9454
/* Type object for the built-in "str" type.

   The initializer is positional: every entry fills the PyTypeObject slot
   named in its trailing comment, so entries must not be reordered. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                              /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
9498
9499/* Initialize the Unicode implementation */
9500
Thomas Wouters78890102000-07-22 19:25:51 +00009501void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009503 int i;
9504
Thomas Wouters477c8d52006-05-27 19:21:47 +00009505 /* XXX - move this array to unicodectype.c ? */
9506 Py_UNICODE linebreak[] = {
9507 0x000A, /* LINE FEED */
9508 0x000D, /* CARRIAGE RETURN */
9509 0x001C, /* FILE SEPARATOR */
9510 0x001D, /* GROUP SEPARATOR */
9511 0x001E, /* RECORD SEPARATOR */
9512 0x0085, /* NEXT LINE */
9513 0x2028, /* LINE SEPARATOR */
9514 0x2029, /* PARAGRAPH SEPARATOR */
9515 };
9516
Fred Drakee4315f52000-05-09 19:53:39 +00009517 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009518 free_list = NULL;
9519 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009521 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009523
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009524 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009526 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009528
9529 /* initialize the linebreak bloom filter */
9530 bloom_linebreak = make_bloom_mask(
9531 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9532 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009533
9534 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535}
9536
9537/* Finalize the Unicode implementation */
9538
Christian Heimesa156e092008-02-16 07:38:31 +00009539int
9540PyUnicode_ClearFreeList(void)
9541{
9542 int freelist_size = numfree;
9543 PyUnicodeObject *u;
9544
9545 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 PyUnicodeObject *v = u;
9547 u = *(PyUnicodeObject **)u;
9548 if (v->str)
9549 PyObject_DEL(v->str);
9550 Py_XDECREF(v->defenc);
9551 PyObject_Del(v);
9552 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009553 }
9554 free_list = NULL;
9555 assert(numfree == 0);
9556 return freelist_size;
9557}
9558
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559void
Thomas Wouters78890102000-07-22 19:25:51 +00009560_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009562 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009564 Py_XDECREF(unicode_empty);
9565 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009566
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009567 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 if (unicode_latin1[i]) {
9569 Py_DECREF(unicode_latin1[i]);
9570 unicode_latin1[i] = NULL;
9571 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009572 }
Christian Heimesa156e092008-02-16 07:38:31 +00009573 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009575
/* Intern the unicode string referenced by *p, in place.

   - *p must be a non-NULL unicode object, otherwise the process is aborted
     via Py_FatalError().
   - Instances of str subclasses and strings that are already interned are
     left untouched.
   - If the interned dict already holds a matching key, the reference in *p
     is released and *p is replaced by a new reference to the stored string.
   - Otherwise the string is inserted as both key and value of the interned
     dict and marked SSTATE_INTERNED_MORTAL.

   No exception is left set: if the dict cannot be created or updated the
   error is cleared and *p is simply left un-interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* Already interned: hand the stored string back to the caller.
           Take the new reference before dropping the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* NOTE(review): recursion_critical is presumably raised for the same
       stack-overflow reason as the lookup above -- confirm.  It is reset
       on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9623
9624void
9625PyUnicode_InternImmortal(PyObject **p)
9626{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 PyUnicode_InternInPlace(p);
9628 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9629 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9630 Py_INCREF(*p);
9631 }
Walter Dörwald16807132007-05-25 13:52:07 +00009632}
9633
9634PyObject *
9635PyUnicode_InternFromString(const char *cp)
9636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009637 PyObject *s = PyUnicode_FromString(cp);
9638 if (s == NULL)
9639 return NULL;
9640 PyUnicode_InternInPlace(&s);
9641 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009642}
9643
9644void _Py_ReleaseInternedUnicodeStrings(void)
9645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 PyObject *keys;
9647 PyUnicodeObject *s;
9648 Py_ssize_t i, n;
9649 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009650
Benjamin Peterson14339b62009-01-31 16:36:08 +00009651 if (interned == NULL || !PyDict_Check(interned))
9652 return;
9653 keys = PyDict_Keys(interned);
9654 if (keys == NULL || !PyList_Check(keys)) {
9655 PyErr_Clear();
9656 return;
9657 }
Walter Dörwald16807132007-05-25 13:52:07 +00009658
Benjamin Peterson14339b62009-01-31 16:36:08 +00009659 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9660 detector, interned unicode strings are not forcibly deallocated;
9661 rather, we give them their stolen references back, and then clear
9662 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009663
Benjamin Peterson14339b62009-01-31 16:36:08 +00009664 n = PyList_GET_SIZE(keys);
9665 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009666 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009667 for (i = 0; i < n; i++) {
9668 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9669 switch (s->state) {
9670 case SSTATE_NOT_INTERNED:
9671 /* XXX Shouldn't happen */
9672 break;
9673 case SSTATE_INTERNED_IMMORTAL:
9674 Py_REFCNT(s) += 1;
9675 immortal_size += s->length;
9676 break;
9677 case SSTATE_INTERNED_MORTAL:
9678 Py_REFCNT(s) += 2;
9679 mortal_size += s->length;
9680 break;
9681 default:
9682 Py_FatalError("Inconsistent interned string state.");
9683 }
9684 s->state = SSTATE_NOT_INTERNED;
9685 }
9686 fprintf(stderr, "total size of all interned strings: "
9687 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9688 "mortal/immortal\n", mortal_size, immortal_size);
9689 Py_DECREF(keys);
9690 PyDict_Clear(interned);
9691 Py_DECREF(interned);
9692 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009693}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009694
9695
9696/********************* Unicode Iterator **************************/
9697
/* State of an iterator over a unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9703
9704static void
9705unicodeiter_dealloc(unicodeiterobject *it)
9706{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 _PyObject_GC_UNTRACK(it);
9708 Py_XDECREF(it->it_seq);
9709 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009710}
9711
/* GC traversal hook: visit the string the iterator references (it_seq).
   Returns 0 once the visit macro has completed. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT relies on the parameters being named `visit` and `arg`. */
    Py_VISIT(it->it_seq);
    return 0;
}
9718
9719static PyObject *
9720unicodeiter_next(unicodeiterobject *it)
9721{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009722 PyUnicodeObject *seq;
9723 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009724
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 assert(it != NULL);
9726 seq = it->it_seq;
9727 if (seq == NULL)
9728 return NULL;
9729 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009730
Benjamin Peterson14339b62009-01-31 16:36:08 +00009731 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9732 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 if (item != NULL)
9735 ++it->it_index;
9736 return item;
9737 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009738
Benjamin Peterson14339b62009-01-31 16:36:08 +00009739 Py_DECREF(seq);
9740 it->it_seq = NULL;
9741 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009742}
9743
9744static PyObject *
9745unicodeiter_len(unicodeiterobject *it)
9746{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 Py_ssize_t len = 0;
9748 if (it->it_seq)
9749 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9750 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009751}
9752
/* Docstring shared by the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the string iterator: only __length_hint__, which takes
   no arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9760
/* Type object for the iterator returned by unicode_iter().

   The initializer is positional: every entry fills the PyTypeObject slot
   named in its trailing comment, so entries must not be reordered.  Slots
   after the last listed one are left zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9793
9794static PyObject *
9795unicode_iter(PyObject *seq)
9796{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009797 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009798
Benjamin Peterson14339b62009-01-31 16:36:08 +00009799 if (!PyUnicode_Check(seq)) {
9800 PyErr_BadInternalCall();
9801 return NULL;
9802 }
9803 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9804 if (it == NULL)
9805 return NULL;
9806 it->it_index = 0;
9807 Py_INCREF(seq);
9808 it->it_seq = (PyUnicodeObject *)seq;
9809 _PyObject_GC_TRACK(it);
9810 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009811}
9812
Martin v. Löwis5b222132007-06-10 09:51:05 +00009813size_t
9814Py_UNICODE_strlen(const Py_UNICODE *u)
9815{
9816 int res = 0;
9817 while(*u++)
9818 res++;
9819 return res;
9820}
9821
9822Py_UNICODE*
9823Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9824{
9825 Py_UNICODE *u = s1;
9826 while ((*u++ = *s2++));
9827 return s1;
9828}
9829
9830Py_UNICODE*
9831Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9832{
9833 Py_UNICODE *u = s1;
9834 while ((*u++ = *s2++))
9835 if (n-- == 0)
9836 break;
9837 return s1;
9838}
9839
9840int
9841Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9842{
9843 while (*s1 && *s2 && *s1 == *s2)
9844 s1++, s2++;
9845 if (*s1 && *s2)
9846 return (*s1 < *s2) ? -1 : +1;
9847 if (*s1)
9848 return 1;
9849 if (*s2)
9850 return -1;
9851 return 0;
9852}
9853
9854Py_UNICODE*
9855Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9856{
9857 const Py_UNICODE *p;
9858 for (p = s; *p; p++)
9859 if (*p == c)
9860 return (Py_UNICODE*)p;
9861 return NULL;
9862}
9863
9864
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009865#ifdef __cplusplus
9866}
9867#endif
9868
9869
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009870/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 Local variables:
9872 c-basic-offset: 4
9873 indent-tabs-mode: nil
9874 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009875*/