blob: bcae239b6addc8af72ee12c2bbe35cd3cf358eed [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep the allocated character buffer intact
   for all objects on the free list whose length (in characters) is
   less than this limit.  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; a non-zero entry marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
    /* 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
156
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point 0..127; a non-zero entry marks a line break.
   The table is only ever read, so it is const like the whitespace table. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000188 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant has the mask's own unsigned type: a plain
   int `1 << 31` would shift into the sign bit (undefined behaviour) and
   sign-extend into the upper half of a 64-bit mask. */
#define BLOOM(mask, ch) ((mask) & ((BLOOM_MASK)1 << ((ch) & 0x1F)))

/* Line break test: direct table lookup for code points below 128, bloom
   pre-filter followed by the exact Py_UNICODE_ISLINEBREAK check above. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Membership test for `chr` in `set` (of `setlen` characters): the bloom
   mask is consulted first so most non-members skip the linear scan.
   The expansion is fully parenthesized so it stays a single operand when
   negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000379 case SSTATE_NOT_INTERNED:
380 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
384 Py_REFCNT(unicode) = 3;
385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
387 "deletion of interned string failed");
388 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_INTERNED_IMMORTAL:
391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 default:
394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401 PyObject_DEL(unicode->str);
402 unicode->str = NULL;
403 unicode->length = 0;
404 }
405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 PyObject_DEL(unicode->str);
416 Py_XDECREF(unicode->defenc);
417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyErr_BadInternalCall();
429 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
447 Py_DECREF(*unicode);
448 *unicode = w;
449 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
Benjamin Peterson29060642009-01-31 22:14:21 +0000471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000475 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
483 if (!unicode)
484 return NULL;
485 unicode->str[0] = *u;
486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
Benjamin Peterson14339b62009-01-31 16:36:08 +0000508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000510 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000511 return NULL;
512 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000525
526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
529 unicode = unicode_latin1[Py_CHARMASK(*u)];
530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
534 unicode->str[0] = Py_CHARMASK(*u);
535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
Mark Dickinson081dfee2009-03-18 14:47:41 +0000564#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
565# define CONVERT_WCHAR_TO_SURROGATES
566#endif
567
568#ifdef CONVERT_WCHAR_TO_SURROGATES
569
570/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
571 to convert from UTF32 to UTF16. */
572
573PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
574 Py_ssize_t size)
575{
576 PyUnicodeObject *unicode;
577 register Py_ssize_t i;
578 Py_ssize_t alloc;
579 const wchar_t *orig_w;
580
581 if (w == NULL) {
582 if (size == 0)
583 return PyUnicode_FromStringAndSize(NULL, 0);
584 PyErr_BadInternalCall();
585 return NULL;
586 }
587
588 if (size == -1) {
589 size = wcslen(w);
590 }
591
592 alloc = size;
593 orig_w = w;
594 for (i = size; i > 0; i--) {
595 if (*w > 0xFFFF)
596 alloc++;
597 w++;
598 }
599 w = orig_w;
600 unicode = _PyUnicode_New(alloc);
601 if (!unicode)
602 return NULL;
603
604 /* Copy the wchar_t data into the new object */
605 {
606 register Py_UNICODE *u;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--) {
609 if (*w > 0xFFFF) {
610 wchar_t ordinal = *w++;
611 ordinal -= 0x10000;
612 *u++ = 0xD800 | (ordinal >> 10);
613 *u++ = 0xDC00 | (ordinal & 0x3FF);
614 }
615 else
616 *u++ = *w++;
617 }
618 }
619 return (PyObject *)unicode;
620}
621
622#else
623
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000625 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626{
627 PyUnicodeObject *unicode;
628
629 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000630 if (size == 0)
631 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000632 PyErr_BadInternalCall();
633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634 }
635
Martin v. Löwis790465f2008-04-05 20:41:37 +0000636 if (size == -1) {
637 size = wcslen(w);
638 }
639
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640 unicode = _PyUnicode_New(size);
641 if (!unicode)
642 return NULL;
643
644 /* Copy the wchar_t data into the new object */
645#ifdef HAVE_USABLE_WCHAR_T
646 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000647#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 register Py_UNICODE *u;
650 register Py_ssize_t i;
651 u = PyUnicode_AS_UNICODE(unicode);
652 for (i = size; i > 0; i--)
653 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655#endif
656
657 return (PyObject *)unicode;
658}
659
Mark Dickinson081dfee2009-03-18 14:47:41 +0000660#endif /* CONVERT_WCHAR_TO_SURROGATES */
661
662#undef CONVERT_WCHAR_TO_SURROGATES
663
Walter Dörwald346737f2007-05-31 10:44:43 +0000664static void
665makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
666{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000667 *fmt++ = '%';
668 if (width) {
669 if (zeropad)
670 *fmt++ = '0';
671 fmt += sprintf(fmt, "%d", width);
672 }
673 if (precision)
674 fmt += sprintf(fmt, ".%d", precision);
675 if (longflag)
676 *fmt++ = 'l';
677 else if (size_tflag) {
678 char *f = PY_FORMAT_SIZE_T;
679 while (*f)
680 *fmt++ = *f++;
681 }
682 *fmt++ = c;
683 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000684}
685
Walter Dörwaldd2034312007-05-18 16:29:38 +0000686#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
687
688PyObject *
689PyUnicode_FromFormatV(const char *format, va_list vargs)
690{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000691 va_list count;
692 Py_ssize_t callcount = 0;
693 PyObject **callresults = NULL;
694 PyObject **callresult = NULL;
695 Py_ssize_t n = 0;
696 int width = 0;
697 int precision = 0;
698 int zeropad;
699 const char* f;
700 Py_UNICODE *s;
701 PyObject *string;
702 /* used by sprintf */
703 char buffer[21];
704 /* use abuffer instead of buffer, if we need more space
705 * (which can happen if there's a format specifier with width). */
706 char *abuffer = NULL;
707 char *realbuffer;
708 Py_ssize_t abuffersize = 0;
709 char fmt[60]; /* should be enough for %0width.precisionld */
710 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000713 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714#else
715#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000716 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#endif
720#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 /* step 1: count the number of %S/%R/%A format specifications
722 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
723 * these objects once during step 3 and put the result in
Benjamin Peterson29060642009-01-31 22:14:21 +0000724 an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 for (f = format; *f; f++) {
726 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
727 ++callcount;
728 }
729 /* step 2: allocate memory for the results of
730 * PyObject_Str()/PyObject_Repr() calls */
731 if (callcount) {
732 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
733 if (!callresults) {
734 PyErr_NoMemory();
735 return NULL;
736 }
737 callresult = callresults;
738 }
739 /* step 3: figure out how large a buffer we need */
740 for (f = format; *f; f++) {
741 if (*f == '%') {
742 const char* p = f;
743 width = 0;
744 while (ISDIGIT((unsigned)*f))
745 width = (width*10) + *f++ - '0';
746 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
747 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Benjamin Peterson14339b62009-01-31 16:36:08 +0000749 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
750 * they don't affect the amount of space we reserve.
751 */
752 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000753 (f[1] == 'd' || f[1] == 'u'))
754 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000755
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 switch (*f) {
757 case 'c':
758 (void)va_arg(count, int);
759 /* fall through... */
760 case '%':
761 n++;
762 break;
763 case 'd': case 'u': case 'i': case 'x':
764 (void) va_arg(count, int);
765 /* 20 bytes is enough to hold a 64-bit
766 integer. Decimal takes the most space.
767 This isn't enough for octal.
768 If a width is specified we need more
769 (which we allocate later). */
770 if (width < 20)
771 width = 20;
772 n += width;
773 if (abuffersize < width)
774 abuffersize = width;
775 break;
776 case 's':
777 {
778 /* UTF-8 */
779 unsigned char*s;
780 s = va_arg(count, unsigned char*);
781 while (*s) {
782 if (*s < 128) {
783 n++; s++;
784 } else if (*s < 0xc0) {
785 /* invalid UTF-8 */
786 n++; s++;
787 } else if (*s < 0xc0) {
788 n++;
789 s++; if(!*s)break;
790 s++;
791 } else if (*s < 0xe0) {
792 n++;
793 s++; if(!*s)break;
794 s++; if(!*s)break;
795 s++;
796 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000797#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 n++;
Benjamin Peterson29060642009-01-31 22:14:21 +0000799#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 n+=2;
Benjamin Peterson29060642009-01-31 22:14:21 +0000801#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 s++; if(!*s)break;
803 s++; if(!*s)break;
804 s++; if(!*s)break;
805 s++;
806 }
807 }
808 break;
809 }
810 case 'U':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 assert(obj && PyUnicode_Check(obj));
814 n += PyUnicode_GET_SIZE(obj);
815 break;
816 }
817 case 'V':
818 {
819 PyObject *obj = va_arg(count, PyObject *);
820 const char *str = va_arg(count, const char *);
821 assert(obj || str);
822 assert(!obj || PyUnicode_Check(obj));
823 if (obj)
824 n += PyUnicode_GET_SIZE(obj);
825 else
826 n += strlen(str);
827 break;
828 }
829 case 'S':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 PyObject *str;
833 assert(obj);
834 str = PyObject_Str(obj);
835 if (!str)
836 goto fail;
837 n += PyUnicode_GET_SIZE(str);
838 /* Remember the str and switch to the next slot */
839 *callresult++ = str;
840 break;
841 }
842 case 'R':
843 {
844 PyObject *obj = va_arg(count, PyObject *);
845 PyObject *repr;
846 assert(obj);
847 repr = PyObject_Repr(obj);
848 if (!repr)
849 goto fail;
850 n += PyUnicode_GET_SIZE(repr);
851 /* Remember the repr and switch to the next slot */
852 *callresult++ = repr;
853 break;
854 }
855 case 'A':
856 {
857 PyObject *obj = va_arg(count, PyObject *);
858 PyObject *ascii;
859 assert(obj);
860 ascii = PyObject_ASCII(obj);
861 if (!ascii)
862 goto fail;
863 n += PyUnicode_GET_SIZE(ascii);
864 /* Remember the repr and switch to the next slot */
865 *callresult++ = ascii;
866 break;
867 }
868 case 'p':
869 (void) va_arg(count, int);
870 /* maximum 64-bit pointer representation:
871 * 0xffffffffffffffff
872 * so 19 characters is enough.
873 * XXX I count 18 -- what's the extra for?
874 */
875 n += 19;
876 break;
877 default:
878 /* if we stumble upon an unknown
879 formatting code, copy the rest of
880 the format string to the output
881 string. (we cannot just skip the
882 code, since there's no way to know
883 what's in the argument list) */
884 n += strlen(p);
885 goto expand;
886 }
887 } else
888 n++;
889 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000890 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000891 if (abuffersize > 20) {
892 abuffer = PyObject_Malloc(abuffersize);
893 if (!abuffer) {
894 PyErr_NoMemory();
895 goto fail;
896 }
897 realbuffer = abuffer;
898 }
899 else
900 realbuffer = buffer;
901 /* step 4: fill the buffer */
902 /* Since we've analyzed how much space we need for the worst case,
903 we don't have to resize the string.
904 There can be no errors beyond this point. */
905 string = PyUnicode_FromUnicode(NULL, n);
906 if (!string)
907 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000908
Benjamin Peterson14339b62009-01-31 16:36:08 +0000909 s = PyUnicode_AS_UNICODE(string);
910 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911
Benjamin Peterson14339b62009-01-31 16:36:08 +0000912 for (f = format; *f; f++) {
913 if (*f == '%') {
914 const char* p = f++;
915 int longflag = 0;
916 int size_tflag = 0;
917 zeropad = (*f == '0');
918 /* parse the width.precision part */
919 width = 0;
920 while (ISDIGIT((unsigned)*f))
921 width = (width*10) + *f++ - '0';
922 precision = 0;
923 if (*f == '.') {
924 f++;
925 while (ISDIGIT((unsigned)*f))
926 precision = (precision*10) + *f++ - '0';
927 }
928 /* handle the long flag, but only for %ld and %lu.
929 others can be added when necessary. */
930 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
931 longflag = 1;
932 ++f;
933 }
934 /* handle the size_t flag. */
935 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
936 size_tflag = 1;
937 ++f;
938 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000939
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 switch (*f) {
941 case 'c':
942 *s++ = va_arg(vargs, int);
943 break;
944 case 'd':
945 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
946 if (longflag)
947 sprintf(realbuffer, fmt, va_arg(vargs, long));
948 else if (size_tflag)
949 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
950 else
951 sprintf(realbuffer, fmt, va_arg(vargs, int));
952 appendstring(realbuffer);
953 break;
954 case 'u':
955 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
956 if (longflag)
957 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
958 else if (size_tflag)
959 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
960 else
961 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
962 appendstring(realbuffer);
963 break;
964 case 'i':
965 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
966 sprintf(realbuffer, fmt, va_arg(vargs, int));
967 appendstring(realbuffer);
968 break;
969 case 'x':
970 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
971 sprintf(realbuffer, fmt, va_arg(vargs, int));
972 appendstring(realbuffer);
973 break;
974 case 's':
975 {
976 /* Parameter must be UTF-8 encoded.
977 In case of encoding errors, use
978 the replacement character. */
979 PyObject *u;
980 p = va_arg(vargs, char*);
981 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Peterson29060642009-01-31 22:14:21 +0000982 "replace");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000983 if (!u)
984 goto fail;
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 PyUnicode_GET_SIZE(u));
Benjamin Peterson14339b62009-01-31 16:36:08 +0000987 s += PyUnicode_GET_SIZE(u);
988 Py_DECREF(u);
989 break;
990 }
991 case 'U':
992 {
993 PyObject *obj = va_arg(vargs, PyObject *);
994 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
995 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
996 s += size;
997 break;
998 }
999 case 'V':
1000 {
1001 PyObject *obj = va_arg(vargs, PyObject *);
1002 const char *str = va_arg(vargs, const char *);
1003 if (obj) {
1004 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1005 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1006 s += size;
1007 } else {
1008 appendstring(str);
1009 }
1010 break;
1011 }
1012 case 'S':
1013 case 'R':
1014 {
1015 Py_UNICODE *ucopy;
1016 Py_ssize_t usize;
1017 Py_ssize_t upos;
1018 /* unused, since we already have the result */
1019 (void) va_arg(vargs, PyObject *);
1020 ucopy = PyUnicode_AS_UNICODE(*callresult);
1021 usize = PyUnicode_GET_SIZE(*callresult);
1022 for (upos = 0; upos<usize;)
1023 *s++ = ucopy[upos++];
1024 /* We're done with the unicode()/repr() => forget it */
1025 Py_DECREF(*callresult);
1026 /* switch to next unicode()/repr() result */
1027 ++callresult;
1028 break;
1029 }
1030 case 'p':
1031 sprintf(buffer, "%p", va_arg(vargs, void*));
1032 /* %p is ill-defined: ensure leading 0x. */
1033 if (buffer[1] == 'X')
1034 buffer[1] = 'x';
1035 else if (buffer[1] != 'x') {
1036 memmove(buffer+2, buffer, strlen(buffer)+1);
1037 buffer[0] = '0';
1038 buffer[1] = 'x';
1039 }
1040 appendstring(buffer);
1041 break;
1042 case '%':
1043 *s++ = '%';
1044 break;
1045 default:
1046 appendstring(p);
1047 goto end;
1048 }
1049 } else
1050 *s++ = *f;
1051 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001052
Benjamin Peterson29060642009-01-31 22:14:21 +00001053 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 if (callresults)
1055 PyObject_Free(callresults);
1056 if (abuffer)
1057 PyObject_Free(abuffer);
1058 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1059 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001060 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001061 if (callresults) {
1062 PyObject **callresult2 = callresults;
1063 while (callresult2 < callresult) {
1064 Py_DECREF(*callresult2);
1065 ++callresult2;
1066 }
1067 PyObject_Free(callresults);
1068 }
1069 if (abuffer)
1070 PyObject_Free(abuffer);
1071 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001072}
1073
1074#undef appendstring
1075
1076PyObject *
1077PyUnicode_FromFormat(const char *format, ...)
1078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 PyObject* ret;
1080 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001081
1082#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001083 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001084#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001085 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001086#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 ret = PyUnicode_FromFormatV(format, vargs);
1088 va_end(vargs);
1089 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001090}
1091
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001093 wchar_t *w,
1094 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095{
1096 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001097 PyErr_BadInternalCall();
1098 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001100
1101 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001103 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105#ifdef HAVE_USABLE_WCHAR_T
1106 memcpy(w, unicode->str, size * sizeof(wchar_t));
1107#else
1108 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 register Py_UNICODE *u;
1110 register Py_ssize_t i;
1111 u = PyUnicode_AS_UNICODE(unicode);
1112 for (i = size; i > 0; i--)
1113 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
1115#endif
1116
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001117 if (size > PyUnicode_GET_SIZE(unicode))
1118 return PyUnicode_GET_SIZE(unicode);
1119 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001120 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121}
1122
1123#endif
1124
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001125PyObject *PyUnicode_FromOrdinal(int ordinal)
1126{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001127 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001128
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001129 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001130 PyErr_SetString(PyExc_ValueError,
1131 "chr() arg not in range(0x110000)");
1132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001134
1135#ifndef Py_UNICODE_WIDE
1136 if (ordinal > 0xffff) {
1137 ordinal -= 0x10000;
1138 s[0] = 0xD800 | (ordinal >> 10);
1139 s[1] = 0xDC00 | (ordinal & 0x3FF);
1140 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141 }
1142#endif
1143
Hye-Shik Chang40574832004-04-06 07:24:51 +00001144 s[0] = (Py_UNICODE)ordinal;
1145 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001146}
1147
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148PyObject *PyUnicode_FromObject(register PyObject *obj)
1149{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001151 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001152 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 Py_INCREF(obj);
1154 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 }
1156 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 /* For a Unicode subtype that's not a Unicode object,
1158 return a true Unicode object with the same data. */
1159 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1160 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001161 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001162 PyErr_Format(PyExc_TypeError,
1163 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001164 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001165 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166}
1167
1168PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001169 const char *encoding,
1170 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001171{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001172 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001173 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 PyErr_BadInternalCall();
1178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001180
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001182 PyErr_SetString(PyExc_TypeError,
1183 "decoding str is not supported");
1184 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001185 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001186
1187 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001188 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 s = PyBytes_AS_STRING(obj);
1190 len = PyBytes_GET_SIZE(obj);
1191 }
1192 else if (PyByteArray_Check(obj)) {
1193 s = PyByteArray_AS_STRING(obj);
1194 len = PyByteArray_GET_SIZE(obj);
1195 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001196 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001197 /* Overwrite the error message with something more useful in
1198 case of a TypeError. */
1199 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001200 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001201 "coercing to str: need string or buffer, "
1202 "%.80s found",
1203 Py_TYPE(obj)->tp_name);
1204 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001205 }
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001209 Py_INCREF(unicode_empty);
1210 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 }
Tim Petersced69f82003-09-16 20:30:58 +00001212 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001213 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001214
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001215 return v;
1216
Benjamin Peterson29060642009-01-31 22:14:21 +00001217 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219}
1220
1221PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 Py_ssize_t size,
1223 const char *encoding,
1224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225{
1226 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001227 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001228 char lower[20]; /* Enough for any encoding name we recognize */
1229 char *l;
1230 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231
1232 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001233 encoding = PyUnicode_GetDefaultEncoding();
1234
1235 /* Convert encoding to lower case and replace '_' with '-' in order to
1236 catch e.g. UTF_8 */
1237 e = encoding;
1238 l = lower;
1239 while (*e && l < &lower[(sizeof lower) - 2]) {
1240 if (ISUPPER(*e)) {
1241 *l++ = TOLOWER(*e++);
1242 }
1243 else if (*e == '_') {
1244 *l++ = '-';
1245 e++;
1246 }
1247 else {
1248 *l++ = *e++;
1249 }
1250 }
1251 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001252
1253 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001256 else if ((strcmp(lower, "latin-1") == 0) ||
1257 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001258 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001259#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001260 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001261 return PyUnicode_DecodeMBCS(s, size, errors);
1262#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001263 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001264 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001265 else if (strcmp(lower, "utf-16") == 0)
1266 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1267 else if (strcmp(lower, "utf-32") == 0)
1268 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269
1270 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001271 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001272 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001273 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001274 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (buffer == NULL)
1276 goto onError;
1277 unicode = PyCodec_Decode(buffer, encoding, errors);
1278 if (unicode == NULL)
1279 goto onError;
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001282 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001283 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 Py_DECREF(unicode);
1285 goto onError;
1286 }
1287 Py_DECREF(buffer);
1288 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001289
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 Py_XDECREF(buffer);
1292 return NULL;
1293}
1294
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001295PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1296 const char *encoding,
1297 const char *errors)
1298{
1299 PyObject *v;
1300
1301 if (!PyUnicode_Check(unicode)) {
1302 PyErr_BadArgument();
1303 goto onError;
1304 }
1305
1306 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001308
1309 /* Decode via the codec registry */
1310 v = PyCodec_Decode(unicode, encoding, errors);
1311 if (v == NULL)
1312 goto onError;
1313 return v;
1314
Benjamin Peterson29060642009-01-31 22:14:21 +00001315 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001316 return NULL;
1317}
1318
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001319PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322{
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001332
1333 /* Decode via the codec registry */
1334 v = PyCodec_Decode(unicode, encoding, errors);
1335 if (v == NULL)
1336 goto onError;
1337 if (!PyUnicode_Check(v)) {
1338 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001339 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001340 Py_TYPE(v)->tp_name);
1341 Py_DECREF(v);
1342 goto onError;
1343 }
1344 return v;
1345
Benjamin Peterson29060642009-01-31 22:14:21 +00001346 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001347 return NULL;
1348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 Py_ssize_t size,
1352 const char *encoding,
1353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354{
1355 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001356
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 unicode = PyUnicode_FromUnicode(s, size);
1358 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1361 Py_DECREF(unicode);
1362 return v;
1363}
1364
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001365PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1366 const char *encoding,
1367 const char *errors)
1368{
1369 PyObject *v;
1370
1371 if (!PyUnicode_Check(unicode)) {
1372 PyErr_BadArgument();
1373 goto onError;
1374 }
1375
1376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001378
1379 /* Encode via the codec registry */
1380 v = PyCodec_Encode(unicode, encoding, errors);
1381 if (v == NULL)
1382 goto onError;
1383 return v;
1384
Benjamin Peterson29060642009-01-31 22:14:21 +00001385 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001386 return NULL;
1387}
1388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1390 const char *encoding,
1391 const char *errors)
1392{
1393 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001394
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 }
Fred Drakee4315f52000-05-09 19:53:39 +00001399
Tim Petersced69f82003-09-16 20:30:58 +00001400 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001401 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001402
1403 /* Shortcuts for common default encodings */
1404 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 if (strcmp(encoding, "utf-8") == 0)
1406 return PyUnicode_AsUTF8String(unicode);
1407 else if (strcmp(encoding, "latin-1") == 0)
1408 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001409#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 else if (strcmp(encoding, "mbcs") == 0)
1411 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001412#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001413 else if (strcmp(encoding, "ascii") == 0)
1414 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001415 /* During bootstrap, we may need to find the encodings
1416 package, to load the file system encoding, and require the
1417 file system encoding in order to load the encodings
1418 package.
1419
1420 Break out of this dependency by assuming that the path to
1421 the encodings module is ASCII-only. XXX could try wcstombs
1422 instead, if the file system encoding is the locale's
1423 encoding. */
1424 else if (Py_FileSystemDefaultEncoding &&
1425 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1426 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001427 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 /* Encode via the codec registry */
1431 v = PyCodec_Encode(unicode, encoding, errors);
1432 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001433 return NULL;
1434
1435 /* The normal path */
1436 if (PyBytes_Check(v))
1437 return v;
1438
1439 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001440 if (PyByteArray_Check(v)) {
1441 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001442 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001443 PyOS_snprintf(msg, sizeof(msg),
1444 "encoder %s returned buffer instead of bytes",
1445 encoding);
1446 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001447 Py_DECREF(v);
1448 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001450
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001451 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1452 Py_DECREF(v);
1453 return b;
1454 }
1455
1456 PyErr_Format(PyExc_TypeError,
1457 "encoder did not return a bytes object (type=%.400s)",
1458 Py_TYPE(v)->tp_name);
1459 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001460 return NULL;
1461}
1462
1463PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1464 const char *encoding,
1465 const char *errors)
1466{
1467 PyObject *v;
1468
1469 if (!PyUnicode_Check(unicode)) {
1470 PyErr_BadArgument();
1471 goto onError;
1472 }
1473
1474 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001476
1477 /* Encode via the codec registry */
1478 v = PyCodec_Encode(unicode, encoding, errors);
1479 if (v == NULL)
1480 goto onError;
1481 if (!PyUnicode_Check(v)) {
1482 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001483 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001484 Py_TYPE(v)->tp_name);
1485 Py_DECREF(v);
1486 goto onError;
1487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return NULL;
1492}
1493
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001494PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001496{
1497 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001498 if (v)
1499 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001500 if (errors != NULL)
1501 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001502 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001505 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001506 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001507 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001508 return v;
1509}
1510
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001511PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001512PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001513 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001514 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1515}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001516
Christian Heimes5894ba72007-11-04 11:43:14 +00001517PyObject*
1518PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1519{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001520 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1521 can be undefined. If it is case, decode using UTF-8. The following assumes
1522 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1523 bootstrapping process where the codecs aren't ready yet.
1524 */
1525 if (Py_FileSystemDefaultEncoding) {
1526#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001527 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001528 return PyUnicode_DecodeMBCS(s, size, "replace");
1529 }
1530#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001531 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001532 return PyUnicode_DecodeUTF8(s, size, "replace");
1533 }
1534#endif
1535 return PyUnicode_Decode(s, size,
1536 Py_FileSystemDefaultEncoding,
1537 "replace");
1538 }
1539 else {
1540 return PyUnicode_DecodeUTF8(s, size, "replace");
1541 }
1542}
1543
Martin v. Löwis5b222132007-06-10 09:51:05 +00001544char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001545_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001546{
Christian Heimesf3863112007-11-22 07:46:41 +00001547 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001548 if (!PyUnicode_Check(unicode)) {
1549 PyErr_BadArgument();
1550 return NULL;
1551 }
Christian Heimesf3863112007-11-22 07:46:41 +00001552 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1553 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001554 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001555 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001556 *psize = PyBytes_GET_SIZE(bytes);
1557 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001558}
1559
1560char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001561_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001562{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001563 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001564}
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1567{
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572 return PyUnicode_AS_UNICODE(unicode);
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 return NULL;
1576}
1577
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579{
1580 if (!PyUnicode_Check(unicode)) {
1581 PyErr_BadArgument();
1582 goto onError;
1583 }
1584 return PyUnicode_GET_SIZE(unicode);
1585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 return -1;
1588}
1589
Thomas Wouters78890102000-07-22 19:25:51 +00001590const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001591{
1592 return unicode_default_encoding;
1593}
1594
1595int PyUnicode_SetDefaultEncoding(const char *encoding)
1596{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001597 if (strcmp(encoding, unicode_default_encoding) != 0) {
1598 PyErr_Format(PyExc_ValueError,
1599 "Can only set default encoding to %s",
1600 unicode_default_encoding);
1601 return -1;
1602 }
Fred Drakee4315f52000-05-09 19:53:39 +00001603 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001604}
1605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606/* error handling callback helper:
1607 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001608 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001609 and adjust various state variables.
1610 return 0 on success, -1 on error
1611*/
1612
1613static
1614int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 const char *encoding, const char *reason,
1616 const char **input, const char **inend, Py_ssize_t *startinpos,
1617 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1618 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001620 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 PyObject *restuple = NULL;
1623 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001624 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001625 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001626 Py_ssize_t requiredsize;
1627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001629 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001630 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 int res = -1;
1632
1633 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001634 *errorHandler = PyCodec_LookupError(errors);
1635 if (*errorHandler == NULL)
1636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 }
1638
1639 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001640 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001641 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1642 if (*exceptionObject == NULL)
1643 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 }
1645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001646 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1647 goto onError;
1648 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1649 goto onError;
1650 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 }
1653
1654 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1655 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001656 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001658 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 }
1661 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001663
1664 /* Copy back the bytes variables, which might have been modified by the
1665 callback */
1666 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1667 if (!inputobj)
1668 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001669 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001671 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001672 *input = PyBytes_AS_STRING(inputobj);
1673 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001674 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001675 /* we can DECREF safely, as the exception has another reference,
1676 so the object won't go away. */
1677 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001680 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001681 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001682 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1683 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001685
1686 /* need more space? (at least enough for what we
1687 have+the replacement+the rest of the string (starting
1688 at the new input position), so we won't have to check space
1689 when there are no errors in the rest of the string) */
1690 repptr = PyUnicode_AS_UNICODE(repunicode);
1691 repsize = PyUnicode_GET_SIZE(repunicode);
1692 requiredsize = *outpos + repsize + insize-newpos;
1693 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001694 if (requiredsize<2*outsize)
1695 requiredsize = 2*outsize;
1696 if (_PyUnicode_Resize(output, requiredsize) < 0)
1697 goto onError;
1698 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 }
1700 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001701 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 Py_UNICODE_COPY(*outptr, repptr, repsize);
1703 *outptr += repsize;
1704 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 /* we made it! */
1707 res = 0;
1708
Benjamin Peterson29060642009-01-31 22:14:21 +00001709 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 Py_XDECREF(restuple);
1711 return res;
1712}
1713
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714/* --- UTF-7 Codec -------------------------------------------------------- */
1715
1716/* see RFC2152 for details */
1717
/* Classification table for the 128 ASCII code points, used by the SPECIAL()
   macro of the UTF-7 codec.  The table is indexed by character code; each
   row below holds 16 consecutive entries (row 0 covers codes 0..15, row 1
   covers 16..31, and so on). */
Tim Petersced69f82003-09-16 20:30:58 +00001718static
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719char utf7_special[128] = {
1720 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1721 encoded:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001722 0 - not special
1723 1 - special
1724 2 - whitespace (optional)
1725 3 - RFC2152 Set O (optional) */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1727 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1728 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1730 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1732 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
1734
1735};
1736
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001737/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1738 warnings about the comparison always being false; since
1739 utf7_special[0] is 1, we can safely make that one comparison
1740 true */
1741
/* SPECIAL(c, encodeO, encodeWS): true when `c` cannot be written directly.
   That is the case for codes above 127, codes <= 0, table class 1, table
   class 2 (whitespace) when `encodeWS` is set and table class 3 (Set O)
   when `encodeO` is set.  `c` is evaluated several times. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001742#define SPECIAL(c, encodeO, encodeWS) \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001743 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 (encodeWS && (utf7_special[(c)] == 2)) || \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001745 (encodeO && (utf7_special[(c)] == 3)))
1746
/* B64(n): base64 alphabet character for the low 6 bits of `n`. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001747#define B64(n) \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001748 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when `c` is alphanumeric (per ISALNUM, defined elsewhere),
   '+' or '/'. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001749#define B64CHAR(c) \
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001750 (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() -- '+' gives 62, '/' gives 63, 'a'.. gives
   c - 71, 'A'.. gives c - 65 and anything else (the digits) gives c + 4.
   No validation is done here; callers test B64CHAR() first. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001751#define UB64(c) \
1752 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001753 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754
/* ENCODE(out, ch, bits): while at least 6 bits are pending in `ch`, write
   the base64 character for the top 6 pending bits to *out++ and reduce
   `bits` by 6.  Expands to a bare `while` statement that modifies `out`
   and `bits` in place. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001755#define ENCODE(out, ch, bits) \
1756 while (bits >= 6) { \
1757 *out++ = B64(ch >> (bits-6)); \
1758 bits -= 6; \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 }
1760
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   `ch`, take the top 16 pending bits as one code unit.  A unit in
   0xDC00..0xDFFF sets `surrogate`, sets `errmsg` and jumps to the
   `utf7Error` label; the unit following such an error is dropped and
   `surrogate` is cleared; any other unit is stored to *out++.
   The expansion site must therefore provide an `errmsg` variable and a
   `utf7Error` label. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001761#define DECODE(out, ch, bits, surrogate) \
1762 while (bits >= 16) { \
1763 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1764 bits -= 16; \
1765 if (surrogate) { \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001766 /* We have already generated an error for the high surrogate \
1767 so let's not bother seeing if the low surrogate is correct or not */ \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001768 surrogate = 0; \
1769 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001770 /* This is a surrogate pair. Unfortunately we can't represent \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001771 it in a 16-bit character */ \
1772 surrogate = 1; \
1773 errmsg = "code pairs are not supported"; \
1774 goto utf7Error; \
1775 } else { \
1776 *out++ = outCh; \
1777 } \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001778 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001780PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 Py_ssize_t size,
1782 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001784 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1785}
1786
/* Decode `size` bytes of UTF-7 data starting at `s` into a new unicode
   object.

   errors:    name of the error handler, passed on to
              unicode_decode_call_errorhandler() whenever a problem is found.
   consumed:  when NULL, input that ends inside a shift sequence is reported
              as an "unterminated shift sequence" error.  When non-NULL, no
              such error is raised; *consumed receives the number of input
              bytes used, which is the position of the opening '+' if the
              input ended inside a shift sequence.

   Returns the new object, or NULL with an exception set.

   State kept across loop iterations: `inShift` (inside a '+'-introduced
   base64 run), `charsleft`/`bitsleft` (pending bits and their count) and
   `surrogate` (see the DECODE macro). */
1787PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 Py_ssize_t size,
1789 const char *errors,
1790 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001793 Py_ssize_t startinpos;
1794 Py_ssize_t endinpos;
1795 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001796 const char *e;
1797 PyUnicodeObject *unicode;
1798 Py_UNICODE *p;
1799 const char *errmsg = "";
1800 int inShift = 0;
1801 unsigned int bitsleft = 0;
1802 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 int surrogate = 0;
1804 PyObject *errorHandler = NULL;
1805 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001806
/* The result is allocated with `size` code units and trimmed to the number
   actually written at the end of the function. */
1807 unicode = _PyUnicode_New(size);
1808 if (!unicode)
1809 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001810 if (size == 0) {
1811 if (consumed)
1812 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001814 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815
1816 p = unicode->str;
1817 e = s + size;
1818
1819 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 Py_UNICODE ch;
/* `restart` is also the target of the goto after the end-of-input error
   handling below, which re-enters the loop body directly. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001822 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823
1824 if (inShift) {
/* '-' or any byte outside the base64 alphabet ends the shift sequence. */
1825 if ((ch == '-') || !B64CHAR(ch)) {
1826 inShift = 0;
1827 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001828
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001829 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1830 if (bitsleft >= 6) {
1831 /* The shift sequence has a partial character in it. If
1832 bitsleft < 6 then we could just classify it as padding
1833 but that is not the case here */
1834
1835 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001836 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837 }
1838 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001839 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840 here so indicate the potential of a misencoded character. */
1841
1842 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1843 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1844 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001845 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 }
1847
1848 if (ch == '-') {
/* NOTE(review): when the terminating '-' is followed by another '-', a
   literal '-' is emitted, the second '-' is not consumed and inShift is
   switched back on -- confirm that this is the intended behaviour. */
1849 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001850 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 inShift = 1;
1852 }
1853 } else if (SPECIAL(ch,0,0)) {
1854 errmsg = "unexpected special character";
Benjamin Peterson14339b62009-01-31 16:36:08 +00001855 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 } else {
1857 *p++ = ch;
1858 }
1859 } else {
/* Base64 digit: append its 6 bits to the pending bits and emit every
   complete 16-bit unit. */
1860 charsleft = (charsleft << 6) | UB64(ch);
1861 bitsleft += 6;
1862 s++;
1863 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1864 }
1865 }
1866 else if ( ch == '+' ) {
/* Remember where the sequence starts: used as error start position and,
   in stateful mode, as the value reported through *consumed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001868 s++;
/* "+-" is the escape for a literal '+'; otherwise a shift sequence begins. */
1869 if (s < e && *s == '-') {
1870 s++;
1871 *p++ = '+';
1872 } else
1873 {
1874 inShift = 1;
1875 bitsleft = 0;
1876 }
1877 }
1878 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001879 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001880 errmsg = "unexpected special character";
1881 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001882 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883 }
1884 else {
1885 *p++ = ch;
1886 s++;
1887 }
1888 continue;
/* Error path: hand the range startinpos..endinpos to the error handler.
   The helper may grow `unicode`, append replacement text (updating `p`)
   and reposition `s`; a non-zero return aborts the decoding. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001889 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 outpos = p-PyUnicode_AS_UNICODE(unicode);
1891 endinpos = s-starts;
1892 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 errors, &errorHandler,
1894 "utf7", errmsg,
1895 &starts, &e, &startinpos, &endinpos, &exc, &s,
1896 &unicode, &outpos, &p))
1897 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001898 }
1899
/* Input ended inside a shift sequence: an error only in stateless mode. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001900 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001901 outpos = p-PyUnicode_AS_UNICODE(unicode);
1902 endinpos = size;
1903 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 errors, &errorHandler,
1905 "utf7", "unterminated shift sequence",
1906 &starts, &e, &startinpos, &endinpos, &exc, &s,
1907 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001908 goto onError;
/* The handler may have moved `s` back before the end: keep decoding. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 if (s < e)
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911 }
/* Stateful mode: report how much input was used; an unfinished shift
   sequence is left unconsumed from its opening '+'. */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001912 if (consumed) {
1913 if(inShift)
1914 *consumed = startinpos;
1915 else
1916 *consumed = s-starts;
1917 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001918
/* Trim the result to the number of code units actually written. */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001919 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001920 goto onError;
1921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 Py_XDECREF(errorHandler);
1923 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001924 return (PyObject *)unicode;
1925
/* Failure: release the cached handler and exception as well as the
   partially built result. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 Py_XDECREF(errorHandler);
1928 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001929 Py_DECREF(unicode);
1930 return NULL;
1931}
1932
1933
1934PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001935 Py_ssize_t size,
1936 int encodeSetO,
1937 int encodeWhiteSpace,
1938 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001940 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001943 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001944 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 unsigned int bitsleft = 0;
1946 unsigned long charsleft = 0;
1947 char * out;
1948 char * start;
1949
1950 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001953 if (cbAllocated / 5 != size)
1954 return PyErr_NoMemory();
1955
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001956 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (v == NULL)
1958 return NULL;
1959
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001960 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001961 for (;i < size; ++i) {
1962 Py_UNICODE ch = s[i];
1963
1964 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001965 if (ch == '+') {
1966 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001967 *out++ = '-';
1968 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1969 charsleft = ch;
1970 bitsleft = 16;
1971 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001972 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001973 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001974 } else {
1975 *out++ = (char) ch;
1976 }
1977 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1979 *out++ = B64(charsleft << (6-bitsleft));
1980 charsleft = 0;
1981 bitsleft = 0;
1982 /* Characters not in the BASE64 set implicitly unshift the sequence
1983 so no '-' is required, except if the character is itself a '-' */
1984 if (B64CHAR(ch) || ch == '-') {
1985 *out++ = '-';
1986 }
1987 inShift = 0;
1988 *out++ = (char) ch;
1989 } else {
1990 bitsleft += 16;
1991 charsleft = (charsleft << 16) | ch;
1992 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1993
Mark Dickinson934896d2009-02-21 20:59:32 +00001994 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001995 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001996 or '-' then the shift sequence will be terminated implicitly and we
1997 don't have to insert a '-'. */
1998
1999 if (bitsleft == 0) {
2000 if (i + 1 < size) {
2001 Py_UNICODE ch2 = s[i+1];
2002
2003 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002005 } else if (B64CHAR(ch2) || ch2 == '-') {
2006 *out++ = '-';
2007 inShift = 0;
2008 } else {
2009 inShift = 0;
2010 }
2011
2012 }
2013 else {
2014 *out++ = '-';
2015 inShift = 0;
2016 }
2017 }
Tim Petersced69f82003-09-16 20:30:58 +00002018 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002019 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002020 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 if (bitsleft) {
2022 *out++= B64(charsleft << (6-bitsleft) );
2023 *out++ = '-';
2024 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002025 if (_PyBytes_Resize(&v, out - start) < 0)
2026 return NULL;
2027 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028}
2029
/* The UTF-7 helper macros are private to the UTF-7 codec above; remove
   them so the names are not visible to the rest of the file. */
2030#undef SPECIAL
2031#undef B64
2032#undef B64CHAR
2033#undef UB64
2034#undef ENCODE
2035#undef DECODE
2036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037/* --- UTF-8 Codec -------------------------------------------------------- */
2038
/* Sequence length lookup for the UTF-8 decoder, indexed by the first byte
   of a sequence (256 entries, 16 per row below):
     0x00-0x7F -> 1        0x80-0xBF -> 0 (not a valid first byte)
     0xC0-0xDF -> 2        0xE0-0xEF -> 3
     0xF0-0xF7 -> 4        0xF8-0xFB -> 5
     0xFC-0xFD -> 6        0xFE-0xFF -> 0
   Note that 0xC0 and 0xC1 map to 2 here; the resulting overlong forms are
   rejected by the decoder's range check, not by this table. */
Tim Petersced69f82003-09-16 20:30:58 +00002039static
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040char utf8_code_length[256] = {
2041 /* Map UTF-8 encoded prefix byte to sequence length. zero means
2042 illegal prefix. see RFC 2279 for details */
2043 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2045 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2046 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2047 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2048 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2049 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2050 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2051 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2052 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2053 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2054 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2055 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2056 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2057 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2058 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
2059};
2060
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 Py_ssize_t size,
2063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064{
Walter Dörwald69652032004-09-07 20:24:22 +00002065 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2066}
2067
Antoine Pitrouab868312009-01-10 15:40:25 +00002068/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2069#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2070
2071/* Mask to quickly check whether a C 'long' contains a
2072 non-ASCII, UTF8-encoded char. */
2073#if (SIZEOF_LONG == 8)
2074# define ASCII_CHAR_MASK 0x8080808080808080L
2075#elif (SIZEOF_LONG == 4)
2076# define ASCII_CHAR_MASK 0x80808080L
2077#else
2078# error C 'long' size should be either 4 or 8!
2079#endif
2080
Walter Dörwald69652032004-09-07 20:24:22 +00002081PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 Py_ssize_t size,
2083 const char *errors,
2084 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t startinpos;
2089 Py_ssize_t endinpos;
2090 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002091 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 PyUnicodeObject *unicode;
2093 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002094 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 PyObject *errorHandler = NULL;
2096 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 /* Note: size will always be longer than the resulting Unicode
2099 character count */
2100 unicode = _PyUnicode_New(size);
2101 if (!unicode)
2102 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002103 if (size == 0) {
2104 if (consumed)
2105 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
2109 /* Unpack UTF-8 encoded data */
2110 p = unicode->str;
2111 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002112 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113
2114 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116
2117 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002118 /* Fast path for runs of ASCII characters. Given that common UTF-8
2119 input will consist of an overwhelming majority of ASCII
2120 characters, we try to optimize for this case by checking
2121 as many characters as a C 'long' can contain.
2122 First, check if we can do an aligned read, as most CPUs have
2123 a penalty for unaligned reads.
2124 */
2125 if (!((size_t) s & LONG_PTR_MASK)) {
2126 /* Help register allocation */
2127 register const char *_s = s;
2128 register Py_UNICODE *_p = p;
2129 while (_s < aligned_end) {
2130 /* Read a whole long at a time (either 4 or 8 bytes),
2131 and do a fast unrolled copy if it only contains ASCII
2132 characters. */
2133 unsigned long data = *(unsigned long *) _s;
2134 if (data & ASCII_CHAR_MASK)
2135 break;
2136 _p[0] = (unsigned char) _s[0];
2137 _p[1] = (unsigned char) _s[1];
2138 _p[2] = (unsigned char) _s[2];
2139 _p[3] = (unsigned char) _s[3];
2140#if (SIZEOF_LONG == 8)
2141 _p[4] = (unsigned char) _s[4];
2142 _p[5] = (unsigned char) _s[5];
2143 _p[6] = (unsigned char) _s[6];
2144 _p[7] = (unsigned char) _s[7];
2145#endif
2146 _s += SIZEOF_LONG;
2147 _p += SIZEOF_LONG;
2148 }
2149 s = _s;
2150 p = _p;
2151 if (s == e)
2152 break;
2153 ch = (unsigned char)*s;
2154 }
2155 }
2156
2157 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002158 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 s++;
2160 continue;
2161 }
2162
2163 n = utf8_code_length[ch];
2164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002165 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 if (consumed)
2167 break;
2168 else {
2169 errmsg = "unexpected end of data";
2170 startinpos = s-starts;
2171 endinpos = size;
2172 goto utf8Error;
2173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175
2176 switch (n) {
2177
2178 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002179 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 startinpos = s-starts;
2181 endinpos = startinpos+1;
2182 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183
2184 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002185 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002186 startinpos = s-starts;
2187 endinpos = startinpos+1;
2188 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
2190 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002191 if ((s[1] & 0xc0) != 0x80) {
2192 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002193 startinpos = s-starts;
2194 endinpos = startinpos+2;
2195 goto utf8Error;
2196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002198 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002199 startinpos = s-starts;
2200 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002201 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002202 goto utf8Error;
2203 }
2204 else
2205 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 break;
2207
2208 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002209 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002210 (s[2] & 0xc0) != 0x80) {
2211 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002212 startinpos = s-starts;
2213 endinpos = startinpos+3;
2214 goto utf8Error;
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002217 if (ch < 0x0800) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 /* Note: UTF-8 encodings of surrogates are considered
2219 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002220
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 XXX For wide builds (UCS-4) we should probably try
2222 to recombine the surrogates into a single code
2223 unit.
2224 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002225 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002226 startinpos = s-starts;
2227 endinpos = startinpos+3;
2228 goto utf8Error;
2229 }
2230 else
2231 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002232 break;
2233
2234 case 4:
2235 if ((s[1] & 0xc0) != 0x80 ||
2236 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002237 (s[3] & 0xc0) != 0x80) {
2238 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002239 startinpos = s-starts;
2240 endinpos = startinpos+4;
2241 goto utf8Error;
2242 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002244 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002245 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002246 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002247 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002248 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 UTF-16 */
2250 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002251 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 startinpos = s-starts;
2253 endinpos = startinpos+4;
2254 goto utf8Error;
2255 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002256#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002257 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002258#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002260
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 /* translate from 10000..10FFFF to 0..FFFF */
2262 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002263
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002264 /* high surrogate = top 10 bits added to D800 */
2265 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002266
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002267 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002268 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002269#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 break;
2271
2272 default:
2273 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002274 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 startinpos = s-starts;
2276 endinpos = startinpos+n;
2277 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 }
2279 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002280 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002281
Benjamin Peterson29060642009-01-31 22:14:21 +00002282 utf8Error:
2283 outpos = p-PyUnicode_AS_UNICODE(unicode);
2284 if (unicode_decode_call_errorhandler(
2285 errors, &errorHandler,
2286 "utf8", errmsg,
2287 &starts, &e, &startinpos, &endinpos, &exc, &s,
2288 &unicode, &outpos, &p))
2289 goto onError;
2290 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
Walter Dörwald69652032004-09-07 20:24:22 +00002292 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002293 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294
2295 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002296 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 goto onError;
2298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 Py_XDECREF(errorHandler);
2300 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 return (PyObject *)unicode;
2302
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002304 Py_XDECREF(errorHandler);
2305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 Py_DECREF(unicode);
2307 return NULL;
2308}
2309
Antoine Pitrouab868312009-01-10 15:40:25 +00002310#undef ASCII_CHAR_MASK
2311
2312
Tim Peters602f7402002-04-27 18:03:26 +00002313/* Allocation strategy: if the string is short, convert into a stack buffer
2314 and allocate exactly as much space needed at the end. Else allocate the
2315 maximum possible needed (4 result bytes per Unicode character), and return
2316 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002317*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002318PyObject *
2319PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002320 Py_ssize_t size,
2321 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322{
Tim Peters602f7402002-04-27 18:03:26 +00002323#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002324
Guido van Rossum98297ee2007-11-06 21:34:58 +00002325 Py_ssize_t i; /* index into s of next input byte */
2326 PyObject *result; /* result string object */
2327 char *p; /* next free byte in output buffer */
2328 Py_ssize_t nallocated; /* number of result bytes allocated */
2329 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002330 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002331
Tim Peters602f7402002-04-27 18:03:26 +00002332 assert(s != NULL);
2333 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Tim Peters602f7402002-04-27 18:03:26 +00002335 if (size <= MAX_SHORT_UNICHARS) {
2336 /* Write into the stack buffer; nallocated can't overflow.
2337 * At the end, we'll allocate exactly as much heap space as it
2338 * turns out we need.
2339 */
2340 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002341 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002342 p = stackbuf;
2343 }
2344 else {
2345 /* Overallocate on the heap, and give the excess back at the end. */
2346 nallocated = size * 4;
2347 if (nallocated / 4 != size) /* overflow! */
2348 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002349 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002350 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002351 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002352 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002353 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002354
Tim Peters602f7402002-04-27 18:03:26 +00002355 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002356 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002357
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002358 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002359 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002361
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002363 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002364 *p++ = (char)(0xc0 | (ch >> 6));
2365 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002366 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002367 else {
Tim Peters602f7402002-04-27 18:03:26 +00002368 /* Encode UCS2 Unicode ordinals */
2369 if (ch < 0x10000) {
2370 /* Special case: check for high surrogate */
2371 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2372 Py_UCS4 ch2 = s[i];
2373 /* Check for low surrogate and combine the two to
2374 form a UCS4 value */
2375 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002376 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002377 i++;
2378 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002379 }
Tim Peters602f7402002-04-27 18:03:26 +00002380 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002381 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002382 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002383 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2384 *p++ = (char)(0x80 | (ch & 0x3f));
2385 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002388 /* Encode UCS4 Unicode ordinals */
2389 *p++ = (char)(0xf0 | (ch >> 18));
2390 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2391 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2392 *p++ = (char)(0x80 | (ch & 0x3f));
2393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002395
Guido van Rossum98297ee2007-11-06 21:34:58 +00002396 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002397 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002398 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002399 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002400 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002401 }
2402 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002403 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002404 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002405 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002406 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002408 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002409
Tim Peters602f7402002-04-27 18:03:26 +00002410#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411}
2412
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2414{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 if (!PyUnicode_Check(unicode)) {
2416 PyErr_BadArgument();
2417 return NULL;
2418 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002419 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 PyUnicode_GET_SIZE(unicode),
2421 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422}
2423
Walter Dörwald41980ca2007-08-16 21:55:45 +00002424/* --- UTF-32 Codec ------------------------------------------------------- */
2425
2426PyObject *
2427PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002428 Py_ssize_t size,
2429 const char *errors,
2430 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002431{
2432 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2433}
2434
2435PyObject *
2436PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 Py_ssize_t size,
2438 const char *errors,
2439 int *byteorder,
2440 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002441{
2442 const char *starts = s;
2443 Py_ssize_t startinpos;
2444 Py_ssize_t endinpos;
2445 Py_ssize_t outpos;
2446 PyUnicodeObject *unicode;
2447 Py_UNICODE *p;
2448#ifndef Py_UNICODE_WIDE
2449 int i, pairs;
2450#else
2451 const int pairs = 0;
2452#endif
2453 const unsigned char *q, *e;
2454 int bo = 0; /* assume native ordering by default */
2455 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002456 /* Offsets from q for retrieving bytes in the right order. */
2457#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2458 int iorder[] = {0, 1, 2, 3};
2459#else
2460 int iorder[] = {3, 2, 1, 0};
2461#endif
2462 PyObject *errorHandler = NULL;
2463 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002464 /* On narrow builds we split characters outside the BMP into two
2465 codepoints => count how much extra space we need. */
2466#ifndef Py_UNICODE_WIDE
2467 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002468 if (((Py_UCS4 *)s)[i] >= 0x10000)
2469 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002470#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002471
2472 /* This might be one to much, because of a BOM */
2473 unicode = _PyUnicode_New((size+3)/4+pairs);
2474 if (!unicode)
2475 return NULL;
2476 if (size == 0)
2477 return (PyObject *)unicode;
2478
2479 /* Unpack UTF-32 encoded data */
2480 p = unicode->str;
2481 q = (unsigned char *)s;
2482 e = q + size;
2483
2484 if (byteorder)
2485 bo = *byteorder;
2486
2487 /* Check for BOM marks (U+FEFF) in the input and adjust current
2488 byte order setting accordingly. In native mode, the leading BOM
2489 mark is skipped, in all other modes, it is copied to the output
2490 stream as-is (giving a ZWNBSP character). */
2491 if (bo == 0) {
2492 if (size >= 4) {
2493 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002495#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 if (bom == 0x0000FEFF) {
2497 q += 4;
2498 bo = -1;
2499 }
2500 else if (bom == 0xFFFE0000) {
2501 q += 4;
2502 bo = 1;
2503 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002504#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 if (bom == 0x0000FEFF) {
2506 q += 4;
2507 bo = 1;
2508 }
2509 else if (bom == 0xFFFE0000) {
2510 q += 4;
2511 bo = -1;
2512 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002513#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002515 }
2516
2517 if (bo == -1) {
2518 /* force LE */
2519 iorder[0] = 0;
2520 iorder[1] = 1;
2521 iorder[2] = 2;
2522 iorder[3] = 3;
2523 }
2524 else if (bo == 1) {
2525 /* force BE */
2526 iorder[0] = 3;
2527 iorder[1] = 2;
2528 iorder[2] = 1;
2529 iorder[3] = 0;
2530 }
2531
2532 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_UCS4 ch;
2534 /* remaining bytes at the end? (size should be divisible by 4) */
2535 if (e-q<4) {
2536 if (consumed)
2537 break;
2538 errmsg = "truncated data";
2539 startinpos = ((const char *)q)-starts;
2540 endinpos = ((const char *)e)-starts;
2541 goto utf32Error;
2542 /* The remaining input chars are ignored if the callback
2543 chooses to skip the input */
2544 }
2545 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2546 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002547
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 if (ch >= 0x110000)
2549 {
2550 errmsg = "codepoint not in range(0x110000)";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = startinpos+4;
2553 goto utf32Error;
2554 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002555#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 if (ch >= 0x10000)
2557 {
2558 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2559 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2560 }
2561 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002563 *p++ = ch;
2564 q += 4;
2565 continue;
2566 utf32Error:
2567 outpos = p-PyUnicode_AS_UNICODE(unicode);
2568 if (unicode_decode_call_errorhandler(
2569 errors, &errorHandler,
2570 "utf32", errmsg,
2571 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2572 &unicode, &outpos, &p))
2573 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002574 }
2575
2576 if (byteorder)
2577 *byteorder = bo;
2578
2579 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002581
2582 /* Adjust length */
2583 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2584 goto onError;
2585
2586 Py_XDECREF(errorHandler);
2587 Py_XDECREF(exc);
2588 return (PyObject *)unicode;
2589
Benjamin Peterson29060642009-01-31 22:14:21 +00002590 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002591 Py_DECREF(unicode);
2592 Py_XDECREF(errorHandler);
2593 Py_XDECREF(exc);
2594 return NULL;
2595}
2596
2597PyObject *
2598PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 Py_ssize_t size,
2600 const char *errors,
2601 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002602{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002603 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002604 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002605 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002606#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002607 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002608#else
2609 const int pairs = 0;
2610#endif
2611 /* Offsets from p for storing byte pairs in the right order. */
2612#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2613 int iorder[] = {0, 1, 2, 3};
2614#else
2615 int iorder[] = {3, 2, 1, 0};
2616#endif
2617
Benjamin Peterson29060642009-01-31 22:14:21 +00002618#define STORECHAR(CH) \
2619 do { \
2620 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2621 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2622 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2623 p[iorder[0]] = (CH) & 0xff; \
2624 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002625 } while(0)
2626
2627 /* In narrow builds we can output surrogate pairs as one codepoint,
2628 so we need less space. */
2629#ifndef Py_UNICODE_WIDE
2630 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002631 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2632 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2633 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002634#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002635 nsize = (size - pairs + (byteorder == 0));
2636 bytesize = nsize * 4;
2637 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002639 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640 if (v == NULL)
2641 return NULL;
2642
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002643 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002644 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002646 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002647 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002648
2649 if (byteorder == -1) {
2650 /* force LE */
2651 iorder[0] = 0;
2652 iorder[1] = 1;
2653 iorder[2] = 2;
2654 iorder[3] = 3;
2655 }
2656 else if (byteorder == 1) {
2657 /* force BE */
2658 iorder[0] = 3;
2659 iorder[1] = 2;
2660 iorder[2] = 1;
2661 iorder[3] = 0;
2662 }
2663
2664 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002666#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002667 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2668 Py_UCS4 ch2 = *s;
2669 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2670 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2671 s++;
2672 size--;
2673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002675#endif
2676 STORECHAR(ch);
2677 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002678
2679 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002680 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002681#undef STORECHAR
2682}
2683
2684PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2685{
2686 if (!PyUnicode_Check(unicode)) {
2687 PyErr_BadArgument();
2688 return NULL;
2689 }
2690 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 PyUnicode_GET_SIZE(unicode),
2692 NULL,
2693 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002694}
2695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696/* --- UTF-16 Codec ------------------------------------------------------- */
2697
Tim Peters772747b2001-08-09 22:21:55 +00002698PyObject *
2699PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 Py_ssize_t size,
2701 const char *errors,
2702 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703{
Walter Dörwald69652032004-09-07 20:24:22 +00002704 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2705}
2706
Antoine Pitrouab868312009-01-10 15:40:25 +00002707/* Two masks for fast checking of whether a C 'long' may contain
2708 UTF16-encoded surrogate characters. This is an efficient heuristic,
2709 assuming that non-surrogate characters with a code point >= 0x8000 are
2710 rare in most input.
2711 FAST_CHAR_MASK is used when the input is in native byte ordering,
2712 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002713*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002714#if (SIZEOF_LONG == 8)
2715# define FAST_CHAR_MASK 0x8000800080008000L
2716# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2717#elif (SIZEOF_LONG == 4)
2718# define FAST_CHAR_MASK 0x80008000L
2719# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2720#else
2721# error C 'long' size should be either 4 or 8!
2722#endif
2723
Walter Dörwald69652032004-09-07 20:24:22 +00002724PyObject *
2725PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002726 Py_ssize_t size,
2727 const char *errors,
2728 int *byteorder,
2729 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t startinpos;
2733 Py_ssize_t endinpos;
2734 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyUnicodeObject *unicode;
2736 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002737 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002738 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002739 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002740 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002741 /* Offsets from q for retrieving byte pairs in the right order. */
2742#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2743 int ihi = 1, ilo = 0;
2744#else
2745 int ihi = 0, ilo = 1;
2746#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749
2750 /* Note: size will always be longer than the resulting Unicode
2751 character count */
2752 unicode = _PyUnicode_New(size);
2753 if (!unicode)
2754 return NULL;
2755 if (size == 0)
2756 return (PyObject *)unicode;
2757
2758 /* Unpack UTF-16 encoded data */
2759 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002760 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002761 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762
2763 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002764 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002766 /* Check for BOM marks (U+FEFF) in the input and adjust current
2767 byte order setting accordingly. In native mode, the leading BOM
2768 mark is skipped, in all other modes, it is copied to the output
2769 stream as-is (giving a ZWNBSP character). */
2770 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002771 if (size >= 2) {
2772 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002773#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002774 if (bom == 0xFEFF) {
2775 q += 2;
2776 bo = -1;
2777 }
2778 else if (bom == 0xFFFE) {
2779 q += 2;
2780 bo = 1;
2781 }
Tim Petersced69f82003-09-16 20:30:58 +00002782#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 if (bom == 0xFEFF) {
2784 q += 2;
2785 bo = 1;
2786 }
2787 else if (bom == 0xFFFE) {
2788 q += 2;
2789 bo = -1;
2790 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002791#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794
Tim Peters772747b2001-08-09 22:21:55 +00002795 if (bo == -1) {
2796 /* force LE */
2797 ihi = 1;
2798 ilo = 0;
2799 }
2800 else if (bo == 1) {
2801 /* force BE */
2802 ihi = 0;
2803 ilo = 1;
2804 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002805#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2806 native_ordering = ilo < ihi;
2807#else
2808 native_ordering = ilo > ihi;
2809#endif
Tim Peters772747b2001-08-09 22:21:55 +00002810
Antoine Pitrouab868312009-01-10 15:40:25 +00002811 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002812 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002813 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002814 /* First check for possible aligned read of a C 'long'. Unaligned
2815 reads are more expensive, better to defer to another iteration. */
2816 if (!((size_t) q & LONG_PTR_MASK)) {
2817 /* Fast path for runs of non-surrogate chars. */
2818 register const unsigned char *_q = q;
2819 Py_UNICODE *_p = p;
2820 if (native_ordering) {
2821 /* Native ordering is simple: as long as the input cannot
2822 possibly contain a surrogate char, do an unrolled copy
2823 of several 16-bit code points to the target object.
2824 The non-surrogate check is done on several input bytes
2825 at a time (as many as a C 'long' can contain). */
2826 while (_q < aligned_end) {
2827 unsigned long data = * (unsigned long *) _q;
2828 if (data & FAST_CHAR_MASK)
2829 break;
2830 _p[0] = ((unsigned short *) _q)[0];
2831 _p[1] = ((unsigned short *) _q)[1];
2832#if (SIZEOF_LONG == 8)
2833 _p[2] = ((unsigned short *) _q)[2];
2834 _p[3] = ((unsigned short *) _q)[3];
2835#endif
2836 _q += SIZEOF_LONG;
2837 _p += SIZEOF_LONG / 2;
2838 }
2839 }
2840 else {
2841 /* Byteswapped ordering is similar, but we must decompose
2842 the copy bytewise, and take care of zero'ing out the
2843 upper bytes if the target object is in 32-bit units
2844 (that is, in UCS-4 builds). */
2845 while (_q < aligned_end) {
2846 unsigned long data = * (unsigned long *) _q;
2847 if (data & SWAPPED_FAST_CHAR_MASK)
2848 break;
2849 /* Zero upper bytes in UCS-4 builds */
2850#if (Py_UNICODE_SIZE > 2)
2851 _p[0] = 0;
2852 _p[1] = 0;
2853#if (SIZEOF_LONG == 8)
2854 _p[2] = 0;
2855 _p[3] = 0;
2856#endif
2857#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002858 /* Issue #4916; UCS-4 builds on big endian machines must
2859 fill the two last bytes of each 4-byte unit. */
2860#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2861# define OFF 2
2862#else
2863# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002864#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002865 ((unsigned char *) _p)[OFF + 1] = _q[0];
2866 ((unsigned char *) _p)[OFF + 0] = _q[1];
2867 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2868 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2869#if (SIZEOF_LONG == 8)
2870 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2871 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2872 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2873 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2874#endif
2875#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002876 _q += SIZEOF_LONG;
2877 _p += SIZEOF_LONG / 2;
2878 }
2879 }
2880 p = _p;
2881 q = _q;
2882 if (q >= e)
2883 break;
2884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002886
Benjamin Peterson14339b62009-01-31 16:36:08 +00002887 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002888
2889 if (ch < 0xD800 || ch > 0xDFFF) {
2890 *p++ = ch;
2891 continue;
2892 }
2893
2894 /* UTF-16 code pair: */
2895 if (q > e) {
2896 errmsg = "unexpected end of data";
2897 startinpos = (((const char *)q) - 2) - starts;
2898 endinpos = ((const char *)e) + 1 - starts;
2899 goto utf16Error;
2900 }
2901 if (0xD800 <= ch && ch <= 0xDBFF) {
2902 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2903 q += 2;
2904 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002905#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 *p++ = ch;
2907 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002908#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002910#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 continue;
2912 }
2913 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002914 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 startinpos = (((const char *)q)-4)-starts;
2916 endinpos = startinpos+2;
2917 goto utf16Error;
2918 }
2919
Benjamin Peterson14339b62009-01-31 16:36:08 +00002920 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 errmsg = "illegal encoding";
2922 startinpos = (((const char *)q)-2)-starts;
2923 endinpos = startinpos+2;
2924 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002925
Benjamin Peterson29060642009-01-31 22:14:21 +00002926 utf16Error:
2927 outpos = p - PyUnicode_AS_UNICODE(unicode);
2928 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002929 errors,
2930 &errorHandler,
2931 "utf16", errmsg,
2932 &starts,
2933 (const char **)&e,
2934 &startinpos,
2935 &endinpos,
2936 &exc,
2937 (const char **)&q,
2938 &unicode,
2939 &outpos,
2940 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002943 /* remaining byte at the end? (size should be even) */
2944 if (e == q) {
2945 if (!consumed) {
2946 errmsg = "truncated data";
2947 startinpos = ((const char *)q) - starts;
2948 endinpos = ((const char *)e) + 1 - starts;
2949 outpos = p - PyUnicode_AS_UNICODE(unicode);
2950 if (unicode_decode_call_errorhandler(
2951 errors,
2952 &errorHandler,
2953 "utf16", errmsg,
2954 &starts,
2955 (const char **)&e,
2956 &startinpos,
2957 &endinpos,
2958 &exc,
2959 (const char **)&q,
2960 &unicode,
2961 &outpos,
2962 &p))
2963 goto onError;
2964 /* The remaining input chars are ignored if the callback
2965 chooses to skip the input */
2966 }
2967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 if (byteorder)
2970 *byteorder = bo;
2971
Walter Dörwald69652032004-09-07 20:24:22 +00002972 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002974
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002976 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 goto onError;
2978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 Py_XDECREF(errorHandler);
2980 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 return (PyObject *)unicode;
2982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 Py_XDECREF(errorHandler);
2986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 return NULL;
2988}
2989
Antoine Pitrouab868312009-01-10 15:40:25 +00002990#undef FAST_CHAR_MASK
2991#undef SWAPPED_FAST_CHAR_MASK
2992
Tim Peters772747b2001-08-09 22:21:55 +00002993PyObject *
2994PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 Py_ssize_t size,
2996 const char *errors,
2997 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002999 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003000 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003001 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003002#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003003 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003004#else
3005 const int pairs = 0;
3006#endif
Tim Peters772747b2001-08-09 22:21:55 +00003007 /* Offsets from p for storing byte pairs in the right order. */
3008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3009 int ihi = 1, ilo = 0;
3010#else
3011 int ihi = 0, ilo = 1;
3012#endif
3013
Benjamin Peterson29060642009-01-31 22:14:21 +00003014#define STORECHAR(CH) \
3015 do { \
3016 p[ihi] = ((CH) >> 8) & 0xff; \
3017 p[ilo] = (CH) & 0xff; \
3018 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003019 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003021#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003022 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 if (s[i] >= 0x10000)
3024 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003025#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003026 /* 2 * (size + pairs + (byteorder == 0)) */
3027 if (size > PY_SSIZE_T_MAX ||
3028 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003030 nsize = size + pairs + (byteorder == 0);
3031 bytesize = nsize * 2;
3032 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003034 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (v == NULL)
3036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003038 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003041 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003042 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003043
3044 if (byteorder == -1) {
3045 /* force LE */
3046 ihi = 1;
3047 ilo = 0;
3048 }
3049 else if (byteorder == 1) {
3050 /* force BE */
3051 ihi = 0;
3052 ilo = 1;
3053 }
3054
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003055 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 Py_UNICODE ch = *s++;
3057 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003058#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 if (ch >= 0x10000) {
3060 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3061 ch = 0xD800 | ((ch-0x10000) >> 10);
3062 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003063#endif
Tim Peters772747b2001-08-09 22:21:55 +00003064 STORECHAR(ch);
3065 if (ch2)
3066 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003067 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003068
3069 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003070 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003071#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072}
3073
3074PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3075{
3076 if (!PyUnicode_Check(unicode)) {
3077 PyErr_BadArgument();
3078 return NULL;
3079 }
3080 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 PyUnicode_GET_SIZE(unicode),
3082 NULL,
3083 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084}
3085
3086/* --- Unicode Escape Codec ----------------------------------------------- */
3087
Fredrik Lundh06d12682001-01-24 07:59:11 +00003088static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 Py_ssize_t size,
3092 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003095 Py_ssize_t startinpos;
3096 Py_ssize_t endinpos;
3097 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003102 char* message;
3103 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 PyObject *errorHandler = NULL;
3105 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 /* Escaped strings will always be longer than the resulting
3108 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 length after conversion to the true value.
3110 (but if the error callback returns a long replacement string
3111 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 v = _PyUnicode_New(size);
3113 if (v == NULL)
3114 goto onError;
3115 if (size == 0)
3116 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 while (s < end) {
3122 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003123 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003124 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 /* Non-escape characters are interpreted as Unicode ordinals */
3127 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003128 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 continue;
3130 }
3131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 /* \ - Escapes */
3134 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003135 c = *s++;
3136 if (s > end)
3137 c = '\0'; /* Invalid after \ */
3138 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139
Benjamin Peterson29060642009-01-31 22:14:21 +00003140 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 case '\n': break;
3142 case '\\': *p++ = '\\'; break;
3143 case '\'': *p++ = '\''; break;
3144 case '\"': *p++ = '\"'; break;
3145 case 'b': *p++ = '\b'; break;
3146 case 'f': *p++ = '\014'; break; /* FF */
3147 case 't': *p++ = '\t'; break;
3148 case 'n': *p++ = '\n'; break;
3149 case 'r': *p++ = '\r'; break;
3150 case 'v': *p++ = '\013'; break; /* VT */
3151 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 case '0': case '1': case '2': case '3':
3155 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003156 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003157 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003158 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003159 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003162 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 break;
3164
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 /* hex escapes */
3166 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003168 digits = 2;
3169 message = "truncated \\xXX escape";
3170 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003174 digits = 4;
3175 message = "truncated \\uXXXX escape";
3176 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003179 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003180 digits = 8;
3181 message = "truncated \\UXXXXXXXX escape";
3182 hexescape:
3183 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 outpos = p-PyUnicode_AS_UNICODE(v);
3185 if (s+digits>end) {
3186 endinpos = size;
3187 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 errors, &errorHandler,
3189 "unicodeescape", "end of string in escape sequence",
3190 &starts, &end, &startinpos, &endinpos, &exc, &s,
3191 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 goto onError;
3193 goto nextByte;
3194 }
3195 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003196 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003197 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198 endinpos = (s+i+1)-starts;
3199 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 errors, &errorHandler,
3201 "unicodeescape", message,
3202 &starts, &end, &startinpos, &endinpos, &exc, &s,
3203 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003204 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003206 }
3207 chr = (chr<<4) & ~0xF;
3208 if (c >= '0' && c <= '9')
3209 chr += c - '0';
3210 else if (c >= 'a' && c <= 'f')
3211 chr += 10 + c - 'a';
3212 else
3213 chr += 10 + c - 'A';
3214 }
3215 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003216 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* _decoding_error will have already written into the
3218 target buffer. */
3219 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003220 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003221 /* when we get here, chr is a 32-bit unicode character */
3222 if (chr <= 0xffff)
3223 /* UCS-2 character */
3224 *p++ = (Py_UNICODE) chr;
3225 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003226 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003227 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003228#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003229 *p++ = chr;
3230#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003231 chr -= 0x10000L;
3232 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003233 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003234#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003235 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 endinpos = s-starts;
3237 outpos = p-PyUnicode_AS_UNICODE(v);
3238 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 errors, &errorHandler,
3240 "unicodeescape", "illegal Unicode character",
3241 &starts, &end, &startinpos, &endinpos, &exc, &s,
3242 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003243 goto onError;
3244 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003245 break;
3246
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003248 case 'N':
3249 message = "malformed \\N character escape";
3250 if (ucnhash_CAPI == NULL) {
3251 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003252 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003253 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003254 if (m == NULL)
3255 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003256 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003257 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003258 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003259 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003260 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003261 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003262 if (ucnhash_CAPI == NULL)
3263 goto ucnhashError;
3264 }
3265 if (*s == '{') {
3266 const char *start = s+1;
3267 /* look for the closing brace */
3268 while (*s != '}' && s < end)
3269 s++;
3270 if (s > start && s < end && *s == '}') {
3271 /* found a name. look it up in the unicode database */
3272 message = "unknown Unicode character name";
3273 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003274 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003275 goto store;
3276 }
3277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 endinpos = s-starts;
3279 outpos = p-PyUnicode_AS_UNICODE(v);
3280 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 errors, &errorHandler,
3282 "unicodeescape", message,
3283 &starts, &end, &startinpos, &endinpos, &exc, &s,
3284 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003285 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003286 break;
3287
3288 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003289 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 message = "\\ at end of string";
3291 s--;
3292 endinpos = s-starts;
3293 outpos = p-PyUnicode_AS_UNICODE(v);
3294 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 errors, &errorHandler,
3296 "unicodeescape", message,
3297 &starts, &end, &startinpos, &endinpos, &exc, &s,
3298 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003299 goto onError;
3300 }
3301 else {
3302 *p++ = '\\';
3303 *p++ = (unsigned char)s[-1];
3304 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003305 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003317 PyErr_SetString(
3318 PyExc_UnicodeError,
3319 "\\N escapes not supported (can't load unicodedata module)"
3320 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003321 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 Py_XDECREF(errorHandler);
3323 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003324 return NULL;
3325
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 Py_XDECREF(errorHandler);
3329 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 return NULL;
3331}
3332
/* Return a Unicode-Escape string version of the Unicode object.

   The result is the bare escaped text: PyUnicode_EncodeUnicodeEscape()
   below takes no "quotes" argument and never wraps its output in
   u"" or u'' quotes.

*/
3339
Thomas Wouters477c8d52006-05-27 19:21:47 +00003340Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 Py_ssize_t size,
3342 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003343{
3344 /* like wcschr, but doesn't stop at NULL characters */
3345
3346 while (size-- > 0) {
3347 if (*s == ch)
3348 return s;
3349 s++;
3350 }
3351
3352 return NULL;
3353}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003354
/* Lower-case hexadecimal digits, indexed by nibble value (0-15); used by
   the escape encoders below to spell out \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
3356
3357PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003360 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003363#ifdef Py_UNICODE_WIDE
3364 const Py_ssize_t expandsize = 10;
3365#else
3366 const Py_ssize_t expandsize = 6;
3367#endif
3368
Thomas Wouters89f507f2006-12-13 04:49:30 +00003369 /* XXX(nnorwitz): rather than over-allocating, it would be
3370 better to choose a different scheme. Perhaps scan the
3371 first N-chars of the string and allocate based on that size.
3372 */
3373 /* Initial allocation is based on the longest-possible unichr
3374 escape.
3375
3376 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3377 unichr, so in this case it's the longest unichr escape. In
3378 narrow (UTF-16) builds this is five chars per source unichr
3379 since there are two unichrs in the surrogate pair, so in narrow
3380 (UTF-16) builds it's not the longest unichr escape.
3381
3382 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3383 so in the narrow (UTF-16) build case it's the longest unichr
3384 escape.
3385 */
3386
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003387 if (size == 0)
3388 return PyBytes_FromStringAndSize(NULL, 0);
3389
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003390 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003392
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003393 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 2
3395 + expandsize*size
3396 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (repr == NULL)
3398 return NULL;
3399
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003400 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 while (size-- > 0) {
3403 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003404
Walter Dörwald79e913e2007-05-12 11:08:06 +00003405 /* Escape backslashes */
3406 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 *p++ = '\\';
3408 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003409 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003410 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003411
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003412#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003413 /* Map 21-bit characters to '\U00xxxxxx' */
3414 else if (ch >= 0x10000) {
3415 *p++ = '\\';
3416 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003417 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3418 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3419 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3420 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3421 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3422 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3423 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3424 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003426 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003427#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3429 else if (ch >= 0xD800 && ch < 0xDC00) {
3430 Py_UNICODE ch2;
3431 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003432
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 ch2 = *s++;
3434 size--;
3435 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3436 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3437 *p++ = '\\';
3438 *p++ = 'U';
3439 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3440 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3441 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3442 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3443 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3444 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3445 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3446 *p++ = hexdigits[ucs & 0x0000000F];
3447 continue;
3448 }
3449 /* Fall through: isolated surrogates are copied as-is */
3450 s--;
3451 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003452 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003453#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003454
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003456 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 *p++ = '\\';
3458 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003459 *p++ = hexdigits[(ch >> 12) & 0x000F];
3460 *p++ = hexdigits[(ch >> 8) & 0x000F];
3461 *p++ = hexdigits[(ch >> 4) & 0x000F];
3462 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003464
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003465 /* Map special whitespace to '\t', \n', '\r' */
3466 else if (ch == '\t') {
3467 *p++ = '\\';
3468 *p++ = 't';
3469 }
3470 else if (ch == '\n') {
3471 *p++ = '\\';
3472 *p++ = 'n';
3473 }
3474 else if (ch == '\r') {
3475 *p++ = '\\';
3476 *p++ = 'r';
3477 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003478
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003479 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003480 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003482 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003483 *p++ = hexdigits[(ch >> 4) & 0x000F];
3484 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003485 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003486
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 /* Copy everything else as-is */
3488 else
3489 *p++ = (char) ch;
3490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003492 assert(p - PyBytes_AS_STRING(repr) > 0);
3493 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3494 return NULL;
3495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496}
3497
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003498PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003500 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 if (!PyUnicode_Check(unicode)) {
3502 PyErr_BadArgument();
3503 return NULL;
3504 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003505 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3506 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003507 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508}
3509
3510/* --- Raw Unicode Escape Codec ------------------------------------------- */
3511
3512PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 Py_ssize_t size,
3514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003517 Py_ssize_t startinpos;
3518 Py_ssize_t endinpos;
3519 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 const char *end;
3523 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 PyObject *errorHandler = NULL;
3525 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 /* Escaped strings will always be longer than the resulting
3528 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 length after conversion to the true value. (But decoding error
3530 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 v = _PyUnicode_New(size);
3532 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 end = s + size;
3538 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 unsigned char c;
3540 Py_UCS4 x;
3541 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003542 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 /* Non-escape characters are interpreted as Unicode ordinals */
3545 if (*s != '\\') {
3546 *p++ = (unsigned char)*s++;
3547 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 startinpos = s-starts;
3550
3551 /* \u-escapes are only interpreted iff the number of leading
3552 backslashes if odd */
3553 bs = s;
3554 for (;s < end;) {
3555 if (*s != '\\')
3556 break;
3557 *p++ = (unsigned char)*s++;
3558 }
3559 if (((s - bs) & 1) == 0 ||
3560 s >= end ||
3561 (*s != 'u' && *s != 'U')) {
3562 continue;
3563 }
3564 p--;
3565 count = *s=='u' ? 4 : 8;
3566 s++;
3567
3568 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3569 outpos = p-PyUnicode_AS_UNICODE(v);
3570 for (x = 0, i = 0; i < count; ++i, ++s) {
3571 c = (unsigned char)*s;
3572 if (!ISXDIGIT(c)) {
3573 endinpos = s-starts;
3574 if (unicode_decode_call_errorhandler(
3575 errors, &errorHandler,
3576 "rawunicodeescape", "truncated \\uXXXX",
3577 &starts, &end, &startinpos, &endinpos, &exc, &s,
3578 &v, &outpos, &p))
3579 goto onError;
3580 goto nextByte;
3581 }
3582 x = (x<<4) & ~0xF;
3583 if (c >= '0' && c <= '9')
3584 x += c - '0';
3585 else if (c >= 'a' && c <= 'f')
3586 x += 10 + c - 'a';
3587 else
3588 x += 10 + c - 'A';
3589 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003590 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 /* UCS-2 character */
3592 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003593 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003594 /* UCS-4 character. Either store directly, or as
3595 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003596#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003598#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 x -= 0x10000L;
3600 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3601 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003602#endif
3603 } else {
3604 endinpos = s-starts;
3605 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003606 if (unicode_decode_call_errorhandler(
3607 errors, &errorHandler,
3608 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003609 &starts, &end, &startinpos, &endinpos, &exc, &s,
3610 &v, &outpos, &p))
3611 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 nextByte:
3614 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003616 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 Py_XDECREF(errorHandler);
3619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003621
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 Py_XDECREF(errorHandler);
3625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return NULL;
3627}
3628
3629PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003632 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 char *p;
3634 char *q;
3635
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003636#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003637 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003638#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003639 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003640#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003641
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003645 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 if (repr == NULL)
3647 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003648 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003649 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003651 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 while (size-- > 0) {
3653 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003654#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 /* Map 32-bit characters to '\Uxxxxxxxx' */
3656 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003657 *p++ = '\\';
3658 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003659 *p++ = hexdigits[(ch >> 28) & 0xf];
3660 *p++ = hexdigits[(ch >> 24) & 0xf];
3661 *p++ = hexdigits[(ch >> 20) & 0xf];
3662 *p++ = hexdigits[(ch >> 16) & 0xf];
3663 *p++ = hexdigits[(ch >> 12) & 0xf];
3664 *p++ = hexdigits[(ch >> 8) & 0xf];
3665 *p++ = hexdigits[(ch >> 4) & 0xf];
3666 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003667 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003668 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003669#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3671 if (ch >= 0xD800 && ch < 0xDC00) {
3672 Py_UNICODE ch2;
3673 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003674
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 ch2 = *s++;
3676 size--;
3677 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3678 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3679 *p++ = '\\';
3680 *p++ = 'U';
3681 *p++ = hexdigits[(ucs >> 28) & 0xf];
3682 *p++ = hexdigits[(ucs >> 24) & 0xf];
3683 *p++ = hexdigits[(ucs >> 20) & 0xf];
3684 *p++ = hexdigits[(ucs >> 16) & 0xf];
3685 *p++ = hexdigits[(ucs >> 12) & 0xf];
3686 *p++ = hexdigits[(ucs >> 8) & 0xf];
3687 *p++ = hexdigits[(ucs >> 4) & 0xf];
3688 *p++ = hexdigits[ucs & 0xf];
3689 continue;
3690 }
3691 /* Fall through: isolated surrogates are copied as-is */
3692 s--;
3693 size++;
3694 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003695#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 /* Map 16-bit characters to '\uxxxx' */
3697 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 *p++ = '\\';
3699 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003700 *p++ = hexdigits[(ch >> 12) & 0xf];
3701 *p++ = hexdigits[(ch >> 8) & 0xf];
3702 *p++ = hexdigits[(ch >> 4) & 0xf];
3703 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 /* Copy everything else as-is */
3706 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 *p++ = (char) ch;
3708 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003709 size = p - q;
3710
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003711 assert(size > 0);
3712 if (_PyBytes_Resize(&repr, size) < 0)
3713 return NULL;
3714 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3718{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003719 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003721 PyErr_BadArgument();
3722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003724 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3725 PyUnicode_GET_SIZE(unicode));
3726
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003727 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728}
3729
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003730/* --- Unicode Internal Codec ------------------------------------------- */
3731
3732PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 Py_ssize_t size,
3734 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003735{
3736 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t startinpos;
3738 Py_ssize_t endinpos;
3739 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003740 PyUnicodeObject *v;
3741 Py_UNICODE *p;
3742 const char *end;
3743 const char *reason;
3744 PyObject *errorHandler = NULL;
3745 PyObject *exc = NULL;
3746
Neal Norwitzd43069c2006-01-08 01:12:10 +00003747#ifdef Py_UNICODE_WIDE
3748 Py_UNICODE unimax = PyUnicode_GetMax();
3749#endif
3750
Thomas Wouters89f507f2006-12-13 04:49:30 +00003751 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003752 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3753 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003755 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003757 p = PyUnicode_AS_UNICODE(v);
3758 end = s + size;
3759
3760 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003761 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003762 /* We have to sanity check the raw data, otherwise doom looms for
3763 some malformed UCS-4 data. */
3764 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003765#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003766 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003767#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003768 end-s < Py_UNICODE_SIZE
3769 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003771 startinpos = s - starts;
3772 if (end-s < Py_UNICODE_SIZE) {
3773 endinpos = end-starts;
3774 reason = "truncated input";
3775 }
3776 else {
3777 endinpos = s - starts + Py_UNICODE_SIZE;
3778 reason = "illegal code point (> 0x10FFFF)";
3779 }
3780 outpos = p - PyUnicode_AS_UNICODE(v);
3781 if (unicode_decode_call_errorhandler(
3782 errors, &errorHandler,
3783 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003785 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003786 goto onError;
3787 }
3788 }
3789 else {
3790 p++;
3791 s += Py_UNICODE_SIZE;
3792 }
3793 }
3794
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003795 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003796 goto onError;
3797 Py_XDECREF(errorHandler);
3798 Py_XDECREF(exc);
3799 return (PyObject *)v;
3800
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003802 Py_XDECREF(v);
3803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
3805 return NULL;
3806}
3807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808/* --- Latin-1 Codec ------------------------------------------------------ */
3809
3810PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 Py_ssize_t size,
3812 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813{
3814 PyUnicodeObject *v;
3815 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003816 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003819 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 Py_UNICODE r = *(unsigned char*)s;
3821 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003822 }
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 v = _PyUnicode_New(size);
3825 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003830 e = s + size;
3831 /* Unrolling the copy makes it much faster by reducing the looping
3832 overhead. This is similar to what many memcpy() implementations do. */
3833 unrolled_end = e - 4;
3834 while (s < unrolled_end) {
3835 p[0] = (unsigned char) s[0];
3836 p[1] = (unsigned char) s[1];
3837 p[2] = (unsigned char) s[2];
3838 p[3] = (unsigned char) s[3];
3839 s += 4;
3840 p += 4;
3841 }
3842 while (s < e)
3843 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003845
Benjamin Peterson29060642009-01-31 22:14:21 +00003846 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 Py_XDECREF(v);
3848 return NULL;
3849}
3850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851/* create or adjust a UnicodeEncodeError */
3852static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 const char *encoding,
3854 const Py_UNICODE *unicode, Py_ssize_t size,
3855 Py_ssize_t startpos, Py_ssize_t endpos,
3856 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 *exceptionObject = PyUnicodeEncodeError_Create(
3860 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 }
3862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3864 goto onError;
3865 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3866 goto onError;
3867 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3868 goto onError;
3869 return;
3870 onError:
3871 Py_DECREF(*exceptionObject);
3872 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 }
3874}
3875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876/* raises a UnicodeEncodeError */
3877static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 const char *encoding,
3879 const Py_UNICODE *unicode, Py_ssize_t size,
3880 Py_ssize_t startpos, Py_ssize_t endpos,
3881 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882{
3883 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887}
3888
3889/* error handling callback helper:
3890 build arguments, call the callback and check the arguments,
3891 put the result into newpos and return the replacement string, which
3892 has to be freed by the caller */
3893static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 PyObject **errorHandler,
3895 const char *encoding, const char *reason,
3896 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3897 Py_ssize_t startpos, Py_ssize_t endpos,
3898 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003900 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901
3902 PyObject *restuple;
3903 PyObject *resunicode;
3904
3905 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 }
3910
3911 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915
3916 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003921 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 Py_DECREF(restuple);
3923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 }
3925 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 &resunicode, newpos)) {
3927 Py_DECREF(restuple);
3928 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
3930 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003931 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003932 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3934 Py_DECREF(restuple);
3935 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 Py_INCREF(resunicode);
3938 Py_DECREF(restuple);
3939 return resunicode;
3940}
3941
3942static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_ssize_t size,
3944 const char *errors,
3945 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946{
3947 /* output object */
3948 PyObject *res;
3949 /* pointers to the beginning and end+1 of input */
3950 const Py_UNICODE *startp = p;
3951 const Py_UNICODE *endp = p + size;
3952 /* pointer to the beginning of the unencodable characters */
3953 /* const Py_UNICODE *badp = NULL; */
3954 /* pointer into the output */
3955 char *str;
3956 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003958 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3959 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 PyObject *errorHandler = NULL;
3961 PyObject *exc = NULL;
3962 /* the following variable is used for caching string comparisons
3963 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3964 int known_errorHandler = -1;
3965
3966 /* allocate enough for a simple encoding without
3967 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003968 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003969 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003970 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003972 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003973 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 ressize = size;
3975
3976 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 /* can we encode this? */
3980 if (c<limit) {
3981 /* no overflow check, because we know that the space is enough */
3982 *str++ = (char)c;
3983 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 else {
3986 Py_ssize_t unicodepos = p-startp;
3987 Py_ssize_t requiredsize;
3988 PyObject *repunicode;
3989 Py_ssize_t repsize;
3990 Py_ssize_t newpos;
3991 Py_ssize_t respos;
3992 Py_UNICODE *uni2;
3993 /* startpos for collecting unencodable chars */
3994 const Py_UNICODE *collstart = p;
3995 const Py_UNICODE *collend = p;
3996 /* find all unecodable characters */
3997 while ((collend < endp) && ((*collend)>=limit))
3998 ++collend;
3999 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4000 if (known_errorHandler==-1) {
4001 if ((errors==NULL) || (!strcmp(errors, "strict")))
4002 known_errorHandler = 1;
4003 else if (!strcmp(errors, "replace"))
4004 known_errorHandler = 2;
4005 else if (!strcmp(errors, "ignore"))
4006 known_errorHandler = 3;
4007 else if (!strcmp(errors, "xmlcharrefreplace"))
4008 known_errorHandler = 4;
4009 else
4010 known_errorHandler = 0;
4011 }
4012 switch (known_errorHandler) {
4013 case 1: /* strict */
4014 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4015 goto onError;
4016 case 2: /* replace */
4017 while (collstart++<collend)
4018 *str++ = '?'; /* fall through */
4019 case 3: /* ignore */
4020 p = collend;
4021 break;
4022 case 4: /* xmlcharrefreplace */
4023 respos = str - PyBytes_AS_STRING(res);
4024 /* determine replacement size (temporarily (mis)uses p) */
4025 for (p = collstart, repsize = 0; p < collend; ++p) {
4026 if (*p<10)
4027 repsize += 2+1+1;
4028 else if (*p<100)
4029 repsize += 2+2+1;
4030 else if (*p<1000)
4031 repsize += 2+3+1;
4032 else if (*p<10000)
4033 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004034#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 else
4036 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 else if (*p<100000)
4039 repsize += 2+5+1;
4040 else if (*p<1000000)
4041 repsize += 2+6+1;
4042 else
4043 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 }
4046 requiredsize = respos+repsize+(endp-collend);
4047 if (requiredsize > ressize) {
4048 if (requiredsize<2*ressize)
4049 requiredsize = 2*ressize;
4050 if (_PyBytes_Resize(&res, requiredsize))
4051 goto onError;
4052 str = PyBytes_AS_STRING(res) + respos;
4053 ressize = requiredsize;
4054 }
4055 /* generate replacement (temporarily (mis)uses p) */
4056 for (p = collstart; p < collend; ++p) {
4057 str += sprintf(str, "&#%d;", (int)*p);
4058 }
4059 p = collend;
4060 break;
4061 default:
4062 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4063 encoding, reason, startp, size, &exc,
4064 collstart-startp, collend-startp, &newpos);
4065 if (repunicode == NULL)
4066 goto onError;
4067 /* need more space? (at least enough for what we
4068 have+the replacement+the rest of the string, so
4069 we won't have to check space for encodable characters) */
4070 respos = str - PyBytes_AS_STRING(res);
4071 repsize = PyUnicode_GET_SIZE(repunicode);
4072 requiredsize = respos+repsize+(endp-collend);
4073 if (requiredsize > ressize) {
4074 if (requiredsize<2*ressize)
4075 requiredsize = 2*ressize;
4076 if (_PyBytes_Resize(&res, requiredsize)) {
4077 Py_DECREF(repunicode);
4078 goto onError;
4079 }
4080 str = PyBytes_AS_STRING(res) + respos;
4081 ressize = requiredsize;
4082 }
4083 /* check if there is anything unencodable in the replacement
4084 and copy it to the output */
4085 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4086 c = *uni2;
4087 if (c >= limit) {
4088 raise_encode_exception(&exc, encoding, startp, size,
4089 unicodepos, unicodepos+1, reason);
4090 Py_DECREF(repunicode);
4091 goto onError;
4092 }
4093 *str = (char)c;
4094 }
4095 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004096 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004098 }
4099 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004100 /* Resize if we allocated to much */
4101 size = str - PyBytes_AS_STRING(res);
4102 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004103 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004104 if (_PyBytes_Resize(&res, size) < 0)
4105 goto onError;
4106 }
4107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004110 return res;
4111
4112 onError:
4113 Py_XDECREF(res);
4114 Py_XDECREF(errorHandler);
4115 Py_XDECREF(exc);
4116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117}
4118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 Py_ssize_t size,
4121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124}
4125
4126PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4127{
4128 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 PyErr_BadArgument();
4130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 }
4132 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 PyUnicode_GET_SIZE(unicode),
4134 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135}
4136
4137/* --- 7-bit ASCII Codec -------------------------------------------------- */
4138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 Py_ssize_t size,
4141 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 PyUnicodeObject *v;
4145 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004146 Py_ssize_t startinpos;
4147 Py_ssize_t endinpos;
4148 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 const char *e;
4150 PyObject *errorHandler = NULL;
4151 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004152
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004154 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 Py_UNICODE r = *(unsigned char*)s;
4156 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004157 }
Tim Petersced69f82003-09-16 20:30:58 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 v = _PyUnicode_New(size);
4160 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 e = s + size;
4166 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 register unsigned char c = (unsigned char)*s;
4168 if (c < 128) {
4169 *p++ = c;
4170 ++s;
4171 }
4172 else {
4173 startinpos = s-starts;
4174 endinpos = startinpos + 1;
4175 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4176 if (unicode_decode_call_errorhandler(
4177 errors, &errorHandler,
4178 "ascii", "ordinal not in range(128)",
4179 &starts, &e, &startinpos, &endinpos, &exc, &s,
4180 &v, &outpos, &p))
4181 goto onError;
4182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004184 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4186 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 Py_XDECREF(errorHandler);
4188 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004190
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 Py_XDECREF(errorHandler);
4194 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 return NULL;
4196}
4197
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 Py_ssize_t size,
4200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203}
4204
4205PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4206{
4207 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 PyErr_BadArgument();
4209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
4211 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 PyUnicode_GET_SIZE(unicode),
4213 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214}
4215
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004216#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004217
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004218/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004219
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004220#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004221#define NEED_RETRY
4222#endif
4223
4224/* XXX This code is limited to "true" double-byte encodings, as
4225 a) it assumes an incomplete character consists of a single byte, and
4226 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004228
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character, 0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when the
   character before it (per CharPrev()) does not swallow it as a trail
   byte: there is no previous character, the previous character does not
   start with a lead byte, or the previous character is a complete
   two-byte character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4239
4240/*
4241 * Decode MBCS string into unicode object. If 'final' is set, converts
4242 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4243 */
4244static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 const char *s, /* MBCS string */
4246 int size, /* sizeof MBCS string */
4247 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004248{
4249 Py_UNICODE *p;
4250 Py_ssize_t n = 0;
4251 int usize = 0;
4252
4253 assert(size >= 0);
4254
4255 /* Skip trailing lead-byte unless 'final' is set */
4256 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004258
4259 /* First get the size of the result */
4260 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4262 if (usize == 0) {
4263 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4264 return -1;
4265 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004266 }
4267
4268 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004269 /* Create unicode object */
4270 *v = _PyUnicode_New(usize);
4271 if (*v == NULL)
4272 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004273 }
4274 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 /* Extend unicode object */
4276 n = PyUnicode_GET_SIZE(*v);
4277 if (_PyUnicode_Resize(v, n + usize) < 0)
4278 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004279 }
4280
4281 /* Do the conversion */
4282 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 p = PyUnicode_AS_UNICODE(*v) + n;
4284 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4285 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4286 return -1;
4287 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004288 }
4289
4290 return size;
4291}
4292
4293PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_ssize_t size,
4295 const char *errors,
4296 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004297{
4298 PyUnicodeObject *v = NULL;
4299 int done;
4300
4301 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004303
4304#ifdef NEED_RETRY
4305 retry:
4306 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004308 else
4309#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004311
4312 if (done < 0) {
4313 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004315 }
4316
4317 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004319
4320#ifdef NEED_RETRY
4321 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 s += done;
4323 size -= done;
4324 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004325 }
4326#endif
4327
4328 return (PyObject *)v;
4329}
4330
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004331PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 Py_ssize_t size,
4333 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004334{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004335 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4336}
4337
4338/*
4339 * Convert unicode into string object (MBCS).
4340 * Returns 0 if succeed, -1 otherwise.
4341 */
4342static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 const Py_UNICODE *p, /* unicode */
4344 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004345{
4346 int mbcssize = 0;
4347 Py_ssize_t n = 0;
4348
4349 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004350
4351 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004352 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4354 if (mbcssize == 0) {
4355 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4356 return -1;
4357 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004358 }
4359
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004360 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 /* Create string object */
4362 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4363 if (*repr == NULL)
4364 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004365 }
4366 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 /* Extend string object */
4368 n = PyBytes_Size(*repr);
4369 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4370 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004371 }
4372
4373 /* Do the conversion */
4374 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 char *s = PyBytes_AS_STRING(*repr) + n;
4376 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4377 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4378 return -1;
4379 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004380 }
4381
4382 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004383}
4384
4385PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 Py_ssize_t size,
4387 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004388{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004389 PyObject *repr = NULL;
4390 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004391
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004392#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004394 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004396 else
4397#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004399
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004400 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 Py_XDECREF(repr);
4402 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004403 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004404
4405#ifdef NEED_RETRY
4406 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 p += INT_MAX;
4408 size -= INT_MAX;
4409 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004410 }
4411#endif
4412
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004413 return repr;
4414}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004415
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004416PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4417{
4418 if (!PyUnicode_Check(unicode)) {
4419 PyErr_BadArgument();
4420 return NULL;
4421 }
4422 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 PyUnicode_GET_SIZE(unicode),
4424 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004425}
4426
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004427#undef NEED_RETRY
4428
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004429#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004430
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431/* --- Character Mapping Codec -------------------------------------------- */
4432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 Py_ssize_t size,
4435 PyObject *mapping,
4436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 Py_ssize_t startinpos;
4440 Py_ssize_t endinpos;
4441 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 PyUnicodeObject *v;
4444 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 PyObject *errorHandler = NULL;
4447 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004448 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 /* Default to Latin-1 */
4452 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
4455 v = _PyUnicode_New(size);
4456 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004462 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 mapstring = PyUnicode_AS_UNICODE(mapping);
4464 maplen = PyUnicode_GET_SIZE(mapping);
4465 while (s < e) {
4466 unsigned char ch = *s;
4467 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 if (ch < maplen)
4470 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 if (x == 0xfffe) {
4473 /* undefined mapping */
4474 outpos = p-PyUnicode_AS_UNICODE(v);
4475 startinpos = s-starts;
4476 endinpos = startinpos+1;
4477 if (unicode_decode_call_errorhandler(
4478 errors, &errorHandler,
4479 "charmap", "character maps to <undefined>",
4480 &starts, &e, &startinpos, &endinpos, &exc, &s,
4481 &v, &outpos, &p)) {
4482 goto onError;
4483 }
4484 continue;
4485 }
4486 *p++ = x;
4487 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004488 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004489 }
4490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 while (s < e) {
4492 unsigned char ch = *s;
4493 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004494
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4496 w = PyLong_FromLong((long)ch);
4497 if (w == NULL)
4498 goto onError;
4499 x = PyObject_GetItem(mapping, w);
4500 Py_DECREF(w);
4501 if (x == NULL) {
4502 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4503 /* No mapping found means: mapping is undefined. */
4504 PyErr_Clear();
4505 x = Py_None;
4506 Py_INCREF(x);
4507 } else
4508 goto onError;
4509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004510
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 /* Apply mapping */
4512 if (PyLong_Check(x)) {
4513 long value = PyLong_AS_LONG(x);
4514 if (value < 0 || value > 65535) {
4515 PyErr_SetString(PyExc_TypeError,
4516 "character mapping must be in range(65536)");
4517 Py_DECREF(x);
4518 goto onError;
4519 }
4520 *p++ = (Py_UNICODE)value;
4521 }
4522 else if (x == Py_None) {
4523 /* undefined mapping */
4524 outpos = p-PyUnicode_AS_UNICODE(v);
4525 startinpos = s-starts;
4526 endinpos = startinpos+1;
4527 if (unicode_decode_call_errorhandler(
4528 errors, &errorHandler,
4529 "charmap", "character maps to <undefined>",
4530 &starts, &e, &startinpos, &endinpos, &exc, &s,
4531 &v, &outpos, &p)) {
4532 Py_DECREF(x);
4533 goto onError;
4534 }
4535 Py_DECREF(x);
4536 continue;
4537 }
4538 else if (PyUnicode_Check(x)) {
4539 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004540
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 if (targetsize == 1)
4542 /* 1-1 mapping */
4543 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004544
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 else if (targetsize > 1) {
4546 /* 1-n mapping */
4547 if (targetsize > extrachars) {
4548 /* resize first */
4549 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4550 Py_ssize_t needed = (targetsize - extrachars) + \
4551 (targetsize << 2);
4552 extrachars += needed;
4553 /* XXX overflow detection missing */
4554 if (_PyUnicode_Resize(&v,
4555 PyUnicode_GET_SIZE(v) + needed) < 0) {
4556 Py_DECREF(x);
4557 goto onError;
4558 }
4559 p = PyUnicode_AS_UNICODE(v) + oldpos;
4560 }
4561 Py_UNICODE_COPY(p,
4562 PyUnicode_AS_UNICODE(x),
4563 targetsize);
4564 p += targetsize;
4565 extrachars -= targetsize;
4566 }
4567 /* 1-0 mapping: skip the character */
4568 }
4569 else {
4570 /* wrong return value */
4571 PyErr_SetString(PyExc_TypeError,
4572 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004573 Py_DECREF(x);
4574 goto onError;
4575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 Py_DECREF(x);
4577 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
4580 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_XDECREF(errorHandler);
4584 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004586
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_XDECREF(errorHandler);
4589 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 Py_XDECREF(v);
4591 return NULL;
4592}
4593
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004594/* Charmap encoding: the lookup table */
4595
4596struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 PyObject_HEAD
4598 unsigned char level1[32];
4599 int count2, count3;
4600 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601};
4602
4603static PyObject*
4604encoding_map_size(PyObject *obj, PyObject* args)
4605{
4606 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004607 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004609}
4610
4611static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004612 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 PyDoc_STR("Return the size (in bytes) of this object") },
4614 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615};
4616
4617static void
4618encoding_map_dealloc(PyObject* o)
4619{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004620 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004621}
4622
4623static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004624 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 "EncodingMap", /*tp_name*/
4626 sizeof(struct encoding_map), /*tp_basicsize*/
4627 0, /*tp_itemsize*/
4628 /* methods */
4629 encoding_map_dealloc, /*tp_dealloc*/
4630 0, /*tp_print*/
4631 0, /*tp_getattr*/
4632 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004633 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 0, /*tp_repr*/
4635 0, /*tp_as_number*/
4636 0, /*tp_as_sequence*/
4637 0, /*tp_as_mapping*/
4638 0, /*tp_hash*/
4639 0, /*tp_call*/
4640 0, /*tp_str*/
4641 0, /*tp_getattro*/
4642 0, /*tp_setattro*/
4643 0, /*tp_as_buffer*/
4644 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4645 0, /*tp_doc*/
4646 0, /*tp_traverse*/
4647 0, /*tp_clear*/
4648 0, /*tp_richcompare*/
4649 0, /*tp_weaklistoffset*/
4650 0, /*tp_iter*/
4651 0, /*tp_iternext*/
4652 encoding_map_methods, /*tp_methods*/
4653 0, /*tp_members*/
4654 0, /*tp_getset*/
4655 0, /*tp_base*/
4656 0, /*tp_dict*/
4657 0, /*tp_descr_get*/
4658 0, /*tp_descr_set*/
4659 0, /*tp_dictoffset*/
4660 0, /*tp_init*/
4661 0, /*tp_alloc*/
4662 0, /*tp_new*/
4663 0, /*tp_free*/
4664 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004665};
4666
4667PyObject*
4668PyUnicode_BuildEncodingMap(PyObject* string)
4669{
4670 Py_UNICODE *decode;
4671 PyObject *result;
4672 struct encoding_map *mresult;
4673 int i;
4674 int need_dict = 0;
4675 unsigned char level1[32];
4676 unsigned char level2[512];
4677 unsigned char *mlevel1, *mlevel2, *mlevel3;
4678 int count2 = 0, count3 = 0;
4679
4680 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4681 PyErr_BadArgument();
4682 return NULL;
4683 }
4684 decode = PyUnicode_AS_UNICODE(string);
4685 memset(level1, 0xFF, sizeof level1);
4686 memset(level2, 0xFF, sizeof level2);
4687
4688 /* If there isn't a one-to-one mapping of NULL to \0,
4689 or if there are non-BMP characters, we need to use
4690 a mapping dictionary. */
4691 if (decode[0] != 0)
4692 need_dict = 1;
4693 for (i = 1; i < 256; i++) {
4694 int l1, l2;
4695 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004696#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004698#endif
4699 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004700 need_dict = 1;
4701 break;
4702 }
4703 if (decode[i] == 0xFFFE)
4704 /* unmapped character */
4705 continue;
4706 l1 = decode[i] >> 11;
4707 l2 = decode[i] >> 7;
4708 if (level1[l1] == 0xFF)
4709 level1[l1] = count2++;
4710 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004711 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004712 }
4713
4714 if (count2 >= 0xFF || count3 >= 0xFF)
4715 need_dict = 1;
4716
4717 if (need_dict) {
4718 PyObject *result = PyDict_New();
4719 PyObject *key, *value;
4720 if (!result)
4721 return NULL;
4722 for (i = 0; i < 256; i++) {
4723 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004724 key = PyLong_FromLong(decode[i]);
4725 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004726 if (!key || !value)
4727 goto failed1;
4728 if (PyDict_SetItem(result, key, value) == -1)
4729 goto failed1;
4730 Py_DECREF(key);
4731 Py_DECREF(value);
4732 }
4733 return result;
4734 failed1:
4735 Py_XDECREF(key);
4736 Py_XDECREF(value);
4737 Py_DECREF(result);
4738 return NULL;
4739 }
4740
4741 /* Create a three-level trie */
4742 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4743 16*count2 + 128*count3 - 1);
4744 if (!result)
4745 return PyErr_NoMemory();
4746 PyObject_Init(result, &EncodingMapType);
4747 mresult = (struct encoding_map*)result;
4748 mresult->count2 = count2;
4749 mresult->count3 = count3;
4750 mlevel1 = mresult->level1;
4751 mlevel2 = mresult->level23;
4752 mlevel3 = mresult->level23 + 16*count2;
4753 memcpy(mlevel1, level1, 32);
4754 memset(mlevel2, 0xFF, 16*count2);
4755 memset(mlevel3, 0, 128*count3);
4756 count3 = 0;
4757 for (i = 1; i < 256; i++) {
4758 int o1, o2, o3, i2, i3;
4759 if (decode[i] == 0xFFFE)
4760 /* unmapped character */
4761 continue;
4762 o1 = decode[i]>>11;
4763 o2 = (decode[i]>>7) & 0xF;
4764 i2 = 16*mlevel1[o1] + o2;
4765 if (mlevel2[i2] == 0xFF)
4766 mlevel2[i2] = count3++;
4767 o3 = decode[i] & 0x7F;
4768 i3 = 128*mlevel2[i2] + o3;
4769 mlevel3[i3] = i;
4770 }
4771 return result;
4772}
4773
4774static int
4775encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4776{
4777 struct encoding_map *map = (struct encoding_map*)mapping;
4778 int l1 = c>>11;
4779 int l2 = (c>>7) & 0xF;
4780 int l3 = c & 0x7F;
4781 int i;
4782
4783#ifdef Py_UNICODE_WIDE
4784 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004786 }
4787#endif
4788 if (c == 0)
4789 return 0;
4790 /* level 1*/
4791 i = map->level1[l1];
4792 if (i == 0xFF) {
4793 return -1;
4794 }
4795 /* level 2*/
4796 i = map->level23[16*i+l2];
4797 if (i == 0xFF) {
4798 return -1;
4799 }
4800 /* level 3 */
4801 i = map->level23[16*map->count2 + 128*i + l3];
4802 if (i == 0) {
4803 return -1;
4804 }
4805 return i;
4806}
4807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808/* Lookup the character ch in the mapping. If the character
4809 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004810 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812{
Christian Heimes217cfd12007-12-02 14:31:20 +00004813 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 PyObject *x;
4815
4816 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 x = PyObject_GetItem(mapping, w);
4819 Py_DECREF(w);
4820 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4822 /* No mapping found means: mapping is undefined. */
4823 PyErr_Clear();
4824 x = Py_None;
4825 Py_INCREF(x);
4826 return x;
4827 } else
4828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004830 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004832 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 long value = PyLong_AS_LONG(x);
4834 if (value < 0 || value > 255) {
4835 PyErr_SetString(PyExc_TypeError,
4836 "character mapping must be in range(256)");
4837 Py_DECREF(x);
4838 return NULL;
4839 }
4840 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 /* wrong return value */
4846 PyErr_Format(PyExc_TypeError,
4847 "character mapping must return integer, bytes or None, not %.400s",
4848 x->ob_type->tp_name);
4849 Py_DECREF(x);
4850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 }
4852}
4853
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004854static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004855charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004857 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4858 /* exponentially overallocate to minimize reallocations */
4859 if (requiredsize < 2*outsize)
4860 requiredsize = 2*outsize;
4861 if (_PyBytes_Resize(outobj, requiredsize))
4862 return -1;
4863 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004864}
4865
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* a Python exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004870 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 space is available. Return a new reference to the object that
4872 was put in the output buffer, or Py_None, if the mapping was undefined
4873 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004874 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004876charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004879 PyObject *rep;
4880 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004881 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882
Christian Heimes90aa7642007-12-19 02:45:37 +00004883 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004884 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886 if (res == -1)
4887 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (outsize<requiredsize)
4889 if (charmapencode_resize(outobj, outpos, requiredsize))
4890 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004891 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 outstart[(*outpos)++] = (char)res;
4893 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004894 }
4895
4896 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004899 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 Py_DECREF(rep);
4901 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004902 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 if (PyLong_Check(rep)) {
4904 Py_ssize_t requiredsize = *outpos+1;
4905 if (outsize<requiredsize)
4906 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4907 Py_DECREF(rep);
4908 return enc_EXCEPTION;
4909 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004910 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 else {
4914 const char *repchars = PyBytes_AS_STRING(rep);
4915 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4916 Py_ssize_t requiredsize = *outpos+repsize;
4917 if (outsize<requiredsize)
4918 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4919 Py_DECREF(rep);
4920 return enc_EXCEPTION;
4921 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004922 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 memcpy(outstart + *outpos, repchars, repsize);
4924 *outpos += repsize;
4925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004927 Py_DECREF(rep);
4928 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929}
4930
4931/* handle an error in PyUnicode_EncodeCharmap
4932 Return 0 on success, -1 on error */
4933static
4934int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004937 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004938 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939{
4940 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 Py_ssize_t repsize;
4942 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943 Py_UNICODE *uni2;
4944 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004945 Py_ssize_t collstartpos = *inpos;
4946 Py_ssize_t collendpos = *inpos+1;
4947 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 char *encoding = "charmap";
4949 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004950 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 /* find all unencodable characters */
4953 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004954 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004955 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 int res = encoding_map_lookup(p[collendpos], mapping);
4957 if (res != -1)
4958 break;
4959 ++collendpos;
4960 continue;
4961 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004962
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 rep = charmapencode_lookup(p[collendpos], mapping);
4964 if (rep==NULL)
4965 return -1;
4966 else if (rep!=Py_None) {
4967 Py_DECREF(rep);
4968 break;
4969 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004970 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 }
4973 /* cache callback name lookup
4974 * (if not done yet, i.e. it's the first error) */
4975 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 if ((errors==NULL) || (!strcmp(errors, "strict")))
4977 *known_errorHandler = 1;
4978 else if (!strcmp(errors, "replace"))
4979 *known_errorHandler = 2;
4980 else if (!strcmp(errors, "ignore"))
4981 *known_errorHandler = 3;
4982 else if (!strcmp(errors, "xmlcharrefreplace"))
4983 *known_errorHandler = 4;
4984 else
4985 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 }
4987 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004988 case 1: /* strict */
4989 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4990 return -1;
4991 case 2: /* replace */
4992 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 x = charmapencode_output('?', mapping, res, respos);
4994 if (x==enc_EXCEPTION) {
4995 return -1;
4996 }
4997 else if (x==enc_FAILED) {
4998 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4999 return -1;
5000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005001 }
5002 /* fall through */
5003 case 3: /* ignore */
5004 *inpos = collendpos;
5005 break;
5006 case 4: /* xmlcharrefreplace */
5007 /* generate replacement (temporarily (mis)uses p) */
5008 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 char buffer[2+29+1+1];
5010 char *cp;
5011 sprintf(buffer, "&#%d;", (int)p[collpos]);
5012 for (cp = buffer; *cp; ++cp) {
5013 x = charmapencode_output(*cp, mapping, res, respos);
5014 if (x==enc_EXCEPTION)
5015 return -1;
5016 else if (x==enc_FAILED) {
5017 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5018 return -1;
5019 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005020 }
5021 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005022 *inpos = collendpos;
5023 break;
5024 default:
5025 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 encoding, reason, p, size, exceptionObject,
5027 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005028 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 return -1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005030 /* generate replacement */
5031 repsize = PyUnicode_GET_SIZE(repunicode);
5032 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 x = charmapencode_output(*uni2, mapping, res, respos);
5034 if (x==enc_EXCEPTION) {
5035 return -1;
5036 }
5037 else if (x==enc_FAILED) {
5038 Py_DECREF(repunicode);
5039 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5040 return -1;
5041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005042 }
5043 *inpos = newpos;
5044 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 }
5046 return 0;
5047}
5048
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 Py_ssize_t size,
5051 PyObject *mapping,
5052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054 /* output object */
5055 PyObject *res = NULL;
5056 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005057 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 PyObject *errorHandler = NULL;
5061 PyObject *exc = NULL;
5062 /* the following variable is used for caching string comparisons
5063 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5064 * 3=ignore, 4=xmlcharrefreplace */
5065 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
5067 /* Default to Latin-1 */
5068 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 /* allocate enough for a simple encoding without
5072 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005073 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 if (res == NULL)
5075 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005076 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 /* try to encode it */
5081 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5082 if (x==enc_EXCEPTION) /* error */
5083 goto onError;
5084 if (x==enc_FAILED) { /* unencodable character */
5085 if (charmap_encoding_error(p, size, &inpos, mapping,
5086 &exc,
5087 &known_errorHandler, &errorHandler, errors,
5088 &res, &respos)) {
5089 goto onError;
5090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 else
5093 /* done with this character => adjust input position */
5094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005098 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005099 if (_PyBytes_Resize(&res, respos) < 0)
5100 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 Py_XDECREF(exc);
5103 Py_XDECREF(errorHandler);
5104 return res;
5105
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 Py_XDECREF(res);
5108 Py_XDECREF(exc);
5109 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 return NULL;
5111}
5112
5113PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115{
5116 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 PyErr_BadArgument();
5118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
5120 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 PyUnicode_GET_SIZE(unicode),
5122 mapping,
5123 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124}
5125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126/* create or adjust a UnicodeTranslateError */
5127static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 const Py_UNICODE *unicode, Py_ssize_t size,
5129 Py_ssize_t startpos, Py_ssize_t endpos,
5130 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005133 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
5136 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5138 goto onError;
5139 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5140 goto onError;
5141 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5142 goto onError;
5143 return;
5144 onError:
5145 Py_DECREF(*exceptionObject);
5146 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 }
5148}
5149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150/* raises a UnicodeTranslateError */
5151static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 const Py_UNICODE *unicode, Py_ssize_t size,
5153 Py_ssize_t startpos, Py_ssize_t endpos,
5154 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155{
5156 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160}
5161
5162/* error handling callback helper:
5163 build arguments, call the callback and check the arguments,
5164 put the result into newpos and return the replacement string, which
5165 has to be freed by the caller */
5166static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 PyObject **errorHandler,
5168 const char *reason,
5169 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5170 Py_ssize_t startpos, Py_ssize_t endpos,
5171 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005173 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005175 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005176 PyObject *restuple;
5177 PyObject *resunicode;
5178
5179 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 }
5184
5185 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189
5190 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005195 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_DECREF(restuple);
5197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 }
5199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 &resunicode, &i_newpos)) {
5201 Py_DECREF(restuple);
5202 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 else
5207 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005208 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5210 Py_DECREF(restuple);
5211 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005212 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 Py_INCREF(resunicode);
5214 Py_DECREF(restuple);
5215 return resunicode;
5216}
5217
5218/* Lookup the character ch in the mapping and put the result in result,
5219 which must be decrefed by the caller.
5220 Return 0 on success, -1 on error */
5221static
5222int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5223{
Christian Heimes217cfd12007-12-02 14:31:20 +00005224 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 PyObject *x;
5226
5227 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 x = PyObject_GetItem(mapping, w);
5230 Py_DECREF(w);
5231 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5233 /* No mapping found means: use 1:1 mapping. */
5234 PyErr_Clear();
5235 *result = NULL;
5236 return 0;
5237 } else
5238 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 }
5240 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 *result = x;
5242 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005244 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 long value = PyLong_AS_LONG(x);
5246 long max = PyUnicode_GetMax();
5247 if (value < 0 || value > max) {
5248 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005249 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 Py_DECREF(x);
5251 return -1;
5252 }
5253 *result = x;
5254 return 0;
5255 }
5256 else if (PyUnicode_Check(x)) {
5257 *result = x;
5258 return 0;
5259 }
5260 else {
5261 /* wrong return value */
5262 PyErr_SetString(PyExc_TypeError,
5263 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005264 Py_DECREF(x);
5265 return -1;
5266 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267}
5268/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 if not reallocate and adjust various state variables.
5270 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271static
Walter Dörwald4894c302003-10-24 14:25:28 +00005272int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005275 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005276 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 /* remember old output position */
5278 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5279 /* exponentially overallocate to minimize reallocations */
5280 if (requiredsize < 2 * oldsize)
5281 requiredsize = 2 * oldsize;
5282 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5283 return -1;
5284 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 }
5286 return 0;
5287}
5288/* lookup the character, put the result in the output string and adjust
5289 various state variables. Return a new reference to the object that
5290 was put in the output buffer in *result, or Py_None, if the mapping was
5291 undefined (in which case no character was written).
5292 The called must decref result.
5293 Return 0 on success, -1 on error. */
5294static
Walter Dörwald4894c302003-10-24 14:25:28 +00005295int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5297 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298{
Walter Dörwald4894c302003-10-24 14:25:28 +00005299 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 /* not found => default to 1:1 mapping */
5303 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 }
5305 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005307 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 /* no overflow check, because we know that the space is enough */
5309 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 }
5311 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5313 if (repsize==1) {
5314 /* no overflow check, because we know that the space is enough */
5315 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5316 }
5317 else if (repsize!=0) {
5318 /* more than one character */
5319 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5320 (insize - (curinp-startinp)) +
5321 repsize - 1;
5322 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5323 return -1;
5324 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5325 *outp += repsize;
5326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327 }
5328 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 return 0;
5331}
5332
5333PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 Py_ssize_t size,
5335 PyObject *mapping,
5336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 /* output object */
5339 PyObject *res = NULL;
5340 /* pointers to the beginning and end+1 of input */
5341 const Py_UNICODE *startp = p;
5342 const Py_UNICODE *endp = p + size;
5343 /* pointer into the output */
5344 Py_UNICODE *str;
5345 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 char *reason = "character maps to <undefined>";
5348 PyObject *errorHandler = NULL;
5349 PyObject *exc = NULL;
5350 /* the following variable is used for caching string comparisons
5351 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5352 * 3=ignore, 4=xmlcharrefreplace */
5353 int known_errorHandler = -1;
5354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 PyErr_BadArgument();
5357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359
5360 /* allocate enough for a simple 1:1 translation without
5361 replacements, if we need more, we'll resize */
5362 res = PyUnicode_FromUnicode(NULL, size);
5363 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 /* try to encode it */
5371 PyObject *x = NULL;
5372 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5373 Py_XDECREF(x);
5374 goto onError;
5375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005376 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 if (x!=Py_None) /* it worked => adjust input pointer */
5378 ++p;
5379 else { /* untranslatable character */
5380 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5381 Py_ssize_t repsize;
5382 Py_ssize_t newpos;
5383 Py_UNICODE *uni2;
5384 /* startpos for collecting untranslatable chars */
5385 const Py_UNICODE *collstart = p;
5386 const Py_UNICODE *collend = p+1;
5387 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 /* find all untranslatable characters */
5390 while (collend < endp) {
5391 if (charmaptranslate_lookup(*collend, mapping, &x))
5392 goto onError;
5393 Py_XDECREF(x);
5394 if (x!=Py_None)
5395 break;
5396 ++collend;
5397 }
5398 /* cache callback name lookup
5399 * (if not done yet, i.e. it's the first error) */
5400 if (known_errorHandler==-1) {
5401 if ((errors==NULL) || (!strcmp(errors, "strict")))
5402 known_errorHandler = 1;
5403 else if (!strcmp(errors, "replace"))
5404 known_errorHandler = 2;
5405 else if (!strcmp(errors, "ignore"))
5406 known_errorHandler = 3;
5407 else if (!strcmp(errors, "xmlcharrefreplace"))
5408 known_errorHandler = 4;
5409 else
5410 known_errorHandler = 0;
5411 }
5412 switch (known_errorHandler) {
5413 case 1: /* strict */
5414 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 case 2: /* replace */
5417 /* No need to check for space, this is a 1:1 replacement */
5418 for (coll = collstart; coll<collend; ++coll)
5419 *str++ = '?';
5420 /* fall through */
5421 case 3: /* ignore */
5422 p = collend;
5423 break;
5424 case 4: /* xmlcharrefreplace */
5425 /* generate replacement (temporarily (mis)uses p) */
5426 for (p = collstart; p < collend; ++p) {
5427 char buffer[2+29+1+1];
5428 char *cp;
5429 sprintf(buffer, "&#%d;", (int)*p);
5430 if (charmaptranslate_makespace(&res, &str,
5431 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5432 goto onError;
5433 for (cp = buffer; *cp; ++cp)
5434 *str++ = *cp;
5435 }
5436 p = collend;
5437 break;
5438 default:
5439 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5440 reason, startp, size, &exc,
5441 collstart-startp, collend-startp, &newpos);
5442 if (repunicode == NULL)
5443 goto onError;
5444 /* generate replacement */
5445 repsize = PyUnicode_GET_SIZE(repunicode);
5446 if (charmaptranslate_makespace(&res, &str,
5447 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5448 Py_DECREF(repunicode);
5449 goto onError;
5450 }
5451 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5452 *str++ = *uni2;
5453 p = startp + newpos;
5454 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005456 }
5457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 /* Resize if we allocated to much */
5459 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005460 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 if (PyUnicode_Resize(&res, respos) < 0)
5462 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 }
5464 Py_XDECREF(exc);
5465 Py_XDECREF(errorHandler);
5466 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 Py_XDECREF(res);
5470 Py_XDECREF(exc);
5471 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 return NULL;
5473}
5474
5475PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 PyObject *mapping,
5477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
5479 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 str = PyUnicode_FromObject(str);
5482 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 PyUnicode_GET_SIZE(str),
5486 mapping,
5487 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 Py_DECREF(str);
5489 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 Py_XDECREF(str);
5493 return NULL;
5494}
Tim Petersced69f82003-09-16 20:30:58 +00005495
Guido van Rossum9e896b32000-04-05 20:11:21 +00005496/* --- Decimal Encoder ---------------------------------------------------- */
5497
5498int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 Py_ssize_t length,
5500 char *output,
5501 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005502{
5503 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 PyObject *errorHandler = NULL;
5505 PyObject *exc = NULL;
5506 const char *encoding = "decimal";
5507 const char *reason = "invalid decimal Unicode string";
5508 /* the following variable is used for caching string comparisons
5509 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5510 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005511
5512 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 PyErr_BadArgument();
5514 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005515 }
5516
5517 p = s;
5518 end = s + length;
5519 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 register Py_UNICODE ch = *p;
5521 int decimal;
5522 PyObject *repunicode;
5523 Py_ssize_t repsize;
5524 Py_ssize_t newpos;
5525 Py_UNICODE *uni2;
5526 Py_UNICODE *collstart;
5527 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005528
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005530 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 ++p;
5532 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 decimal = Py_UNICODE_TODECIMAL(ch);
5535 if (decimal >= 0) {
5536 *output++ = '0' + decimal;
5537 ++p;
5538 continue;
5539 }
5540 if (0 < ch && ch < 256) {
5541 *output++ = (char)ch;
5542 ++p;
5543 continue;
5544 }
5545 /* All other characters are considered unencodable */
5546 collstart = p;
5547 collend = p+1;
5548 while (collend < end) {
5549 if ((0 < *collend && *collend < 256) ||
5550 !Py_UNICODE_ISSPACE(*collend) ||
5551 Py_UNICODE_TODECIMAL(*collend))
5552 break;
5553 }
5554 /* cache callback name lookup
5555 * (if not done yet, i.e. it's the first error) */
5556 if (known_errorHandler==-1) {
5557 if ((errors==NULL) || (!strcmp(errors, "strict")))
5558 known_errorHandler = 1;
5559 else if (!strcmp(errors, "replace"))
5560 known_errorHandler = 2;
5561 else if (!strcmp(errors, "ignore"))
5562 known_errorHandler = 3;
5563 else if (!strcmp(errors, "xmlcharrefreplace"))
5564 known_errorHandler = 4;
5565 else
5566 known_errorHandler = 0;
5567 }
5568 switch (known_errorHandler) {
5569 case 1: /* strict */
5570 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5571 goto onError;
5572 case 2: /* replace */
5573 for (p = collstart; p < collend; ++p)
5574 *output++ = '?';
5575 /* fall through */
5576 case 3: /* ignore */
5577 p = collend;
5578 break;
5579 case 4: /* xmlcharrefreplace */
5580 /* generate replacement (temporarily (mis)uses p) */
5581 for (p = collstart; p < collend; ++p)
5582 output += sprintf(output, "&#%d;", (int)*p);
5583 p = collend;
5584 break;
5585 default:
5586 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5587 encoding, reason, s, length, &exc,
5588 collstart-s, collend-s, &newpos);
5589 if (repunicode == NULL)
5590 goto onError;
5591 /* generate replacement */
5592 repsize = PyUnicode_GET_SIZE(repunicode);
5593 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5594 Py_UNICODE ch = *uni2;
5595 if (Py_UNICODE_ISSPACE(ch))
5596 *output++ = ' ';
5597 else {
5598 decimal = Py_UNICODE_TODECIMAL(ch);
5599 if (decimal >= 0)
5600 *output++ = '0' + decimal;
5601 else if (0 < ch && ch < 256)
5602 *output++ = (char)ch;
5603 else {
5604 Py_DECREF(repunicode);
5605 raise_encode_exception(&exc, encoding,
5606 s, length, collstart-s, collend-s, reason);
5607 goto onError;
5608 }
5609 }
5610 }
5611 p = s + newpos;
5612 Py_DECREF(repunicode);
5613 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005614 }
5615 /* 0-terminate the output string */
5616 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 Py_XDECREF(exc);
5618 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005619 return 0;
5620
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 Py_XDECREF(exc);
5623 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005624 return -1;
5625}
5626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627/* --- Helpers ------------------------------------------------------------ */
5628
Eric Smith8c663262007-08-25 02:26:07 +00005629#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005630#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005631#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005632/* Include _ParseTupleFinds from find.h */
5633#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005634#include "stringlib/find.h"
5635#include "stringlib/partition.h"
5636
Eric Smith5807c412008-05-11 21:00:57 +00005637#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5638#include "stringlib/localeutil.h"
5639
/* Clamp the slice bounds held in the local variables `start` and `end`
   against (obj)->length: a negative bound is first taken relative to the
   length and then floored at 0; `end` is additionally capped at the
   length.  Expands to plain statements that modify `start` and `end`,
   which must be in scope at the point of use. */
#define FIX_START_END(obj)                                  \
    if (end > (obj)->length) end = (obj)->length;           \
    if (end < 0) end += (obj)->length;                      \
    if (end < 0) end = 0;                                   \
    if (start < 0) start += (obj)->length;                  \
    if (start < 0) start = 0;
5652
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005654 PyObject *substr,
5655 Py_ssize_t start,
5656 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005659 PyUnicodeObject* str_obj;
5660 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005661
Thomas Wouters477c8d52006-05-27 19:21:47 +00005662 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5663 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005665 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5666 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 Py_DECREF(str_obj);
5668 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 }
Tim Petersced69f82003-09-16 20:30:58 +00005670
Thomas Wouters477c8d52006-05-27 19:21:47 +00005671 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005672
Thomas Wouters477c8d52006-05-27 19:21:47 +00005673 result = stringlib_count(
5674 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5675 );
5676
5677 Py_DECREF(sub_obj);
5678 Py_DECREF(str_obj);
5679
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return result;
5681}
5682
Martin v. Löwis18e16552006-02-15 17:27:45 +00005683Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005684 PyObject *sub,
5685 Py_ssize_t start,
5686 Py_ssize_t end,
5687 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005689 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005692 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005694 sub = PyUnicode_FromObject(sub);
5695 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 Py_DECREF(str);
5697 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
Tim Petersced69f82003-09-16 20:30:58 +00005699
Thomas Wouters477c8d52006-05-27 19:21:47 +00005700 if (direction > 0)
5701 result = stringlib_find_slice(
5702 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5703 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5704 start, end
5705 );
5706 else
5707 result = stringlib_rfind_slice(
5708 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5709 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5710 start, end
5711 );
5712
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005714 Py_DECREF(sub);
5715
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 return result;
5717}
5718
Tim Petersced69f82003-09-16 20:30:58 +00005719static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 PyUnicodeObject *substring,
5722 Py_ssize_t start,
5723 Py_ssize_t end,
5724 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 if (substring->length == 0)
5727 return 1;
5728
Thomas Wouters477c8d52006-05-27 19:21:47 +00005729 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731 end -= substring->length;
5732 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
5735 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 if (Py_UNICODE_MATCH(self, end, substring))
5737 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 } else {
5739 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
5742
5743 return 0;
5744}
5745
Martin v. Löwis18e16552006-02-15 17:27:45 +00005746Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 PyObject *substr,
5748 Py_ssize_t start,
5749 Py_ssize_t end,
5750 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005753
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 str = PyUnicode_FromObject(str);
5755 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 substr = PyUnicode_FromObject(substr);
5758 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 Py_DECREF(str);
5760 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Tim Petersced69f82003-09-16 20:30:58 +00005762
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 (PyUnicodeObject *)substr,
5765 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 Py_DECREF(str);
5767 Py_DECREF(substr);
5768 return result;
5769}
5770
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771/* Apply fixfct filter to the Unicode object self and return a
5772 reference to the modified object */
5773
Tim Petersced69f82003-09-16 20:30:58 +00005774static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
5778
5779 PyUnicodeObject *u;
5780
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005781 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005784
5785 Py_UNICODE_COPY(u->str, self->str, self->length);
5786
Tim Peters7a29bd52001-09-12 03:03:31 +00005787 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 /* fixfct should return TRUE if it modified the buffer. If
5789 FALSE, return a reference to the original buffer instead
5790 (to save space, not time) */
5791 Py_INCREF(self);
5792 Py_DECREF(u);
5793 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 }
5795 return (PyObject*) u;
5796}
5797
Tim Petersced69f82003-09-16 20:30:58 +00005798static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799int fixupper(PyUnicodeObject *self)
5800{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005801 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 Py_UNICODE *s = self->str;
5803 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005804
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005807
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 ch = Py_UNICODE_TOUPPER(*s);
5809 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 *s = ch;
5812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 s++;
5814 }
5815
5816 return status;
5817}
5818
Tim Petersced69f82003-09-16 20:30:58 +00005819static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820int fixlower(PyUnicodeObject *self)
5821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 Py_UNICODE *s = self->str;
5824 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005825
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005828
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 ch = Py_UNICODE_TOLOWER(*s);
5830 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 *s = ch;
5833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 s++;
5835 }
5836
5837 return status;
5838}
5839
Tim Petersced69f82003-09-16 20:30:58 +00005840static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841int fixswapcase(PyUnicodeObject *self)
5842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005843 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_UNICODE *s = self->str;
5845 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005846
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 while (len-- > 0) {
5848 if (Py_UNICODE_ISUPPER(*s)) {
5849 *s = Py_UNICODE_TOLOWER(*s);
5850 status = 1;
5851 } else if (Py_UNICODE_ISLOWER(*s)) {
5852 *s = Py_UNICODE_TOUPPER(*s);
5853 status = 1;
5854 }
5855 s++;
5856 }
5857
5858 return status;
5859}
5860
Tim Petersced69f82003-09-16 20:30:58 +00005861static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862int fixcapitalize(PyUnicodeObject *self)
5863{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005865 Py_UNICODE *s = self->str;
5866 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005867
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005868 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005870 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 *s = Py_UNICODE_TOUPPER(*s);
5872 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005874 s++;
5875 while (--len > 0) {
5876 if (Py_UNICODE_ISUPPER(*s)) {
5877 *s = Py_UNICODE_TOLOWER(*s);
5878 status = 1;
5879 }
5880 s++;
5881 }
5882 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
5885static
5886int fixtitle(PyUnicodeObject *self)
5887{
5888 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5889 register Py_UNICODE *e;
5890 int previous_is_cased;
5891
5892 /* Shortcut for single character strings */
5893 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5895 if (*p != ch) {
5896 *p = ch;
5897 return 1;
5898 }
5899 else
5900 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 }
Tim Petersced69f82003-09-16 20:30:58 +00005902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 e = p + PyUnicode_GET_SIZE(self);
5904 previous_is_cased = 0;
5905 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005907
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 if (previous_is_cased)
5909 *p = Py_UNICODE_TOLOWER(ch);
5910 else
5911 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 if (Py_UNICODE_ISLOWER(ch) ||
5914 Py_UNICODE_ISUPPER(ch) ||
5915 Py_UNICODE_ISTITLE(ch))
5916 previous_is_cased = 1;
5917 else
5918 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
5920 return 1;
5921}
5922
Tim Peters8ce9f162004-08-27 01:49:32 +00005923PyObject *
5924PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925{
Skip Montanaro6543b452004-09-16 03:28:13 +00005926 const Py_UNICODE blank = ' ';
5927 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005928 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005929 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005930 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5931 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005932 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5933 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005934 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005935 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
Tim Peters05eba1f2004-08-27 21:32:02 +00005937 fseq = PySequence_Fast(seq, "");
5938 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005939 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005940 }
5941
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005942 /* NOTE: the following code can't call back into Python code,
5943 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005944 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005945
Tim Peters05eba1f2004-08-27 21:32:02 +00005946 seqlen = PySequence_Fast_GET_SIZE(fseq);
5947 /* If empty sequence, return u"". */
5948 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005949 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5950 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005951 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005952 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005953 /* If singleton sequence with an exact Unicode, return that. */
5954 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 item = items[0];
5956 if (PyUnicode_CheckExact(item)) {
5957 Py_INCREF(item);
5958 res = (PyUnicodeObject *)item;
5959 goto Done;
5960 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005961 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005962 else {
5963 /* Set up sep and seplen */
5964 if (separator == NULL) {
5965 sep = &blank;
5966 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005967 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005968 else {
5969 if (!PyUnicode_Check(separator)) {
5970 PyErr_Format(PyExc_TypeError,
5971 "separator: expected str instance,"
5972 " %.80s found",
5973 Py_TYPE(separator)->tp_name);
5974 goto onError;
5975 }
5976 sep = PyUnicode_AS_UNICODE(separator);
5977 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005978 }
5979 }
5980
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005981 /* There are at least two things to join, or else we have a subclass
5982 * of str in the sequence.
5983 * Do a pre-pass to figure out the total amount of space we'll
5984 * need (sz), and see whether all argument are strings.
5985 */
5986 sz = 0;
5987 for (i = 0; i < seqlen; i++) {
5988 const Py_ssize_t old_sz = sz;
5989 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 if (!PyUnicode_Check(item)) {
5991 PyErr_Format(PyExc_TypeError,
5992 "sequence item %zd: expected str instance,"
5993 " %.80s found",
5994 i, Py_TYPE(item)->tp_name);
5995 goto onError;
5996 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005997 sz += PyUnicode_GET_SIZE(item);
5998 if (i != 0)
5999 sz += seplen;
6000 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6001 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006003 goto onError;
6004 }
6005 }
Tim Petersced69f82003-09-16 20:30:58 +00006006
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006007 res = _PyUnicode_New(sz);
6008 if (res == NULL)
6009 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006010
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006011 /* Catenate everything. */
6012 res_p = PyUnicode_AS_UNICODE(res);
6013 for (i = 0; i < seqlen; ++i) {
6014 Py_ssize_t itemlen;
6015 item = items[i];
6016 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* Copy item, and maybe the separator. */
6018 if (i) {
6019 Py_UNICODE_COPY(res_p, sep, seplen);
6020 res_p += seplen;
6021 }
6022 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6023 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006024 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006027 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 return (PyObject *)res;
6029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006031 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006032 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
6034}
6035
Tim Petersced69f82003-09-16 20:30:58 +00006036static
6037PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 Py_ssize_t left,
6039 Py_ssize_t right,
6040 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041{
6042 PyUnicodeObject *u;
6043
6044 if (left < 0)
6045 left = 0;
6046 if (right < 0)
6047 right = 0;
6048
Tim Peters7a29bd52001-09-12 03:03:31 +00006049 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 Py_INCREF(self);
6051 return self;
6052 }
6053
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006054 if (left > PY_SSIZE_T_MAX - self->length ||
6055 right > PY_SSIZE_T_MAX - (left + self->length)) {
6056 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6057 return NULL;
6058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 u = _PyUnicode_New(left + self->length + right);
6060 if (u) {
6061 if (left)
6062 Py_UNICODE_FILL(u->str, fill, left);
6063 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6064 if (right)
6065 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6066 }
6067
6068 return u;
6069}
6070
/* Append the slice data[left:right] to `list` as a new string object.

   Relies on names from the expanding function: a `PyObject *str` scratch
   variable, the target `list`, and an `onError` label that is jumped to
   when creating the slice or appending it fails.

   Wrapped in do { } while (0) so the expansion is a single statement and
   cannot capture a following `else`; always terminate a use with `;`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
6082static
6083PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 PyObject *list,
6085 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006087 register Py_ssize_t i;
6088 register Py_ssize_t j;
6089 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006091 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092
6093 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006095 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006097 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6099 i++;
6100 if (j < i) {
6101 if (maxcount-- <= 0)
6102 break;
6103 SPLIT_APPEND(buf, j, i);
6104 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6105 i++;
6106 j = i;
6107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
6109 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 }
6112 return list;
6113
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 Py_DECREF(list);
6116 return NULL;
6117}
6118
6119PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122 register Py_ssize_t i;
6123 register Py_ssize_t j;
6124 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 PyObject *list;
6126 PyObject *str;
6127 Py_UNICODE *data;
6128
6129 string = PyUnicode_FromObject(string);
6130 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 data = PyUnicode_AS_UNICODE(string);
6133 len = PyUnicode_GET_SIZE(string);
6134
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 list = PyList_New(0);
6136 if (!list)
6137 goto onError;
6138
6139 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006141
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 /* Find a line and append it */
6143 while (i < len && !BLOOM_LINEBREAK(data[i]))
6144 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006147 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 if (i < len) {
6149 if (data[i] == '\r' && i + 1 < len &&
6150 data[i+1] == '\n')
6151 i += 2;
6152 else
6153 i++;
6154 if (keepends)
6155 eol = i;
6156 }
6157 SPLIT_APPEND(data, j, eol);
6158 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
6160 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
6163
6164 Py_DECREF(string);
6165 return list;
6166
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006168 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 Py_DECREF(string);
6170 return NULL;
6171}
6172
Tim Petersced69f82003-09-16 20:30:58 +00006173static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 PyObject *list,
6176 Py_UNICODE ch,
6177 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006179 register Py_ssize_t i;
6180 register Py_ssize_t j;
6181 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006183 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 if (buf[i] == ch) {
6187 if (maxcount-- <= 0)
6188 break;
6189 SPLIT_APPEND(buf, j, i);
6190 i = j = i + 1;
6191 } else
6192 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 }
6194 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 }
6197 return list;
6198
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 Py_DECREF(list);
6201 return NULL;
6202}
6203
Tim Petersced69f82003-09-16 20:30:58 +00006204static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 PyObject *list,
6207 PyUnicodeObject *substring,
6208 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006210 register Py_ssize_t i;
6211 register Py_ssize_t j;
6212 Py_ssize_t len = self->length;
6213 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 PyObject *str;
6215
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006216 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 if (Py_UNICODE_MATCH(self, i, substring)) {
6218 if (maxcount-- <= 0)
6219 break;
6220 SPLIT_APPEND(self->str, j, i);
6221 i = j = i + sublen;
6222 } else
6223 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 }
6225 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 }
6228 return list;
6229
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 Py_DECREF(list);
6232 return NULL;
6233}
6234
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006235static
6236PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 PyObject *list,
6238 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006240 register Py_ssize_t i;
6241 register Py_ssize_t j;
6242 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006243 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006244 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006245
6246 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006248 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006250 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6252 i--;
6253 if (j > i) {
6254 if (maxcount-- <= 0)
6255 break;
6256 SPLIT_APPEND(buf, i + 1, j + 1);
6257 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6258 i--;
6259 j = i;
6260 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006261 }
6262 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006265 if (PyList_Reverse(list) < 0)
6266 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006267 return list;
6268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006270 Py_DECREF(list);
6271 return NULL;
6272}
6273
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006275PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 PyObject *list,
6277 Py_UNICODE ch,
6278 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006279{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006280 register Py_ssize_t i;
6281 register Py_ssize_t j;
6282 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006283 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006284 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006285
6286 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 if (buf[i] == ch) {
6288 if (maxcount-- <= 0)
6289 break;
6290 SPLIT_APPEND(buf, i + 1, j + 1);
6291 j = i = i - 1;
6292 } else
6293 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006294 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006295 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006297 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006298 if (PyList_Reverse(list) < 0)
6299 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006300 return list;
6301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006303 Py_DECREF(list);
6304 return NULL;
6305}
6306
Benjamin Peterson14339b62009-01-31 16:36:08 +00006307static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006308PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 PyObject *list,
6310 PyUnicodeObject *substring,
6311 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006312{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006313 register Py_ssize_t i;
6314 register Py_ssize_t j;
6315 Py_ssize_t len = self->length;
6316 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006317 PyObject *str;
6318
6319 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 if (Py_UNICODE_MATCH(self, i, substring)) {
6321 if (maxcount-- <= 0)
6322 break;
6323 SPLIT_APPEND(self->str, i + sublen, j);
6324 j = i;
6325 i -= sublen;
6326 } else
6327 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006328 }
6329 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006331 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006332 if (PyList_Reverse(list) < 0)
6333 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006334 return list;
6335
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006337 Py_DECREF(list);
6338 return NULL;
6339}
6340
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341#undef SPLIT_APPEND
6342
6343static
6344PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 PyUnicodeObject *substring,
6346 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
6348 PyObject *list;
6349
6350 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006351 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352
6353 list = PyList_New(0);
6354 if (!list)
6355 return NULL;
6356
6357 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359
6360 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362
6363 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 Py_DECREF(list);
6365 PyErr_SetString(PyExc_ValueError, "empty separator");
6366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 }
6368 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370}
6371
Tim Petersced69f82003-09-16 20:30:58 +00006372static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 PyUnicodeObject *substring,
6375 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006376{
6377 PyObject *list;
6378
6379 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006380 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006381
6382 list = PyList_New(0);
6383 if (!list)
6384 return NULL;
6385
6386 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006388
6389 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006391
6392 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 Py_DECREF(list);
6394 PyErr_SetString(PyExc_ValueError, "empty separator");
6395 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006396 }
6397 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006399}
6400
6401static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 PyUnicodeObject *str1,
6404 PyUnicodeObject *str2,
6405 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
6407 PyUnicodeObject *u;
6408
6409 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411
Thomas Wouters477c8d52006-05-27 19:21:47 +00006412 if (str1->length == str2->length) {
6413 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006414 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006415 if (str1->length == 1) {
6416 /* replace characters */
6417 Py_UNICODE u1, u2;
6418 if (!findchar(self->str, self->length, str1->str[0]))
6419 goto nothing;
6420 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6421 if (!u)
6422 return NULL;
6423 Py_UNICODE_COPY(u->str, self->str, self->length);
6424 u1 = str1->str[0];
6425 u2 = str2->str[0];
6426 for (i = 0; i < u->length; i++)
6427 if (u->str[i] == u1) {
6428 if (--maxcount < 0)
6429 break;
6430 u->str[i] = u2;
6431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006433 i = fastsearch(
6434 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 if (i < 0)
6437 goto nothing;
6438 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6439 if (!u)
6440 return NULL;
6441 Py_UNICODE_COPY(u->str, self->str, self->length);
6442 while (i <= self->length - str1->length)
6443 if (Py_UNICODE_MATCH(self, i, str1)) {
6444 if (--maxcount < 0)
6445 break;
6446 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6447 i += str1->length;
6448 } else
6449 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452
6453 Py_ssize_t n, i, j, e;
6454 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 Py_UNICODE *p;
6456
6457 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006458 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 if (n > maxcount)
6460 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 if (n == 0)
6462 goto nothing;
6463 /* new_size = self->length + n * (str2->length - str1->length)); */
6464 delta = (str2->length - str1->length);
6465 if (delta == 0) {
6466 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006468 product = n * (str2->length - str1->length);
6469 if ((product / (str2->length - str1->length)) != n) {
6470 PyErr_SetString(PyExc_OverflowError,
6471 "replace string is too long");
6472 return NULL;
6473 }
6474 new_size = self->length + product;
6475 if (new_size < 0) {
6476 PyErr_SetString(PyExc_OverflowError,
6477 "replace string is too long");
6478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
6480 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006481 u = _PyUnicode_New(new_size);
6482 if (!u)
6483 return NULL;
6484 i = 0;
6485 p = u->str;
6486 e = self->length - str1->length;
6487 if (str1->length > 0) {
6488 while (n-- > 0) {
6489 /* look for next match */
6490 j = i;
6491 while (j <= e) {
6492 if (Py_UNICODE_MATCH(self, j, str1))
6493 break;
6494 j++;
6495 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 if (j > e)
6498 break;
6499 /* copy unchanged part [i:j] */
6500 Py_UNICODE_COPY(p, self->str+i, j-i);
6501 p += j - i;
6502 }
6503 /* copy substitution string */
6504 if (str2->length > 0) {
6505 Py_UNICODE_COPY(p, str2->str, str2->length);
6506 p += str2->length;
6507 }
6508 i = j + str1->length;
6509 }
6510 if (i < self->length)
6511 /* copy tail [i:] */
6512 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6513 } else {
6514 /* interleave */
6515 while (n > 0) {
6516 Py_UNICODE_COPY(p, str2->str, str2->length);
6517 p += str2->length;
6518 if (--n <= 0)
6519 break;
6520 *p++ = self->str[i++];
6521 }
6522 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006528 /* nothing to replace; return original string (when possible) */
6529 if (PyUnicode_CheckExact(self)) {
6530 Py_INCREF(self);
6531 return (PyObject *) self;
6532 }
6533 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534}
6535
6536/* --- Unicode Object Methods --------------------------------------------- */
6537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540\n\
6541Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006542characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543
6544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006545unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return fixup(self, fixtitle);
6548}
6549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006550PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552\n\
6553Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006554have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555
6556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006557unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 return fixup(self, fixcapitalize);
6560}
6561
#if 0
/* Disabled: kept out of the build by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word via
   fixup(..., fixcapitalize), and join the words again with single blanks.
   Returns the joined string, or NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6599
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006600/* Argument converter. Coerces to a single unicode character */
6601
6602static int
6603convert_uc(PyObject *obj, void *addr)
6604{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006605 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6606 PyObject *uniobj;
6607 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006608
Benjamin Peterson14339b62009-01-31 16:36:08 +00006609 uniobj = PyUnicode_FromObject(obj);
6610 if (uniobj == NULL) {
6611 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006613 return 0;
6614 }
6615 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6616 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 Py_DECREF(uniobj);
6619 return 0;
6620 }
6621 unistr = PyUnicode_AS_UNICODE(uniobj);
6622 *fillcharloc = unistr[0];
6623 Py_DECREF(uniobj);
6624 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006625}
6626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006627PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006630Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006631done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
6633static PyObject *
6634unicode_center(PyUnicodeObject *self, PyObject *args)
6635{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006636 Py_ssize_t marg, left;
6637 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006638 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
Thomas Woutersde017742006-02-16 19:34:37 +00006640 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 return NULL;
6642
Tim Peters7a29bd52001-09-12 03:03:31 +00006643 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 Py_INCREF(self);
6645 return (PyObject*) self;
6646 }
6647
6648 marg = width - self->length;
6649 left = marg / 2 + (marg & width & 1);
6650
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006651 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
Marc-André Lemburge5034372000-08-08 08:04:29 +00006654#if 0
6655
6656/* This code should go into some future Unicode collation support
6657 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006658 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006659
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006660/* speedy UTF-16 code point order comparison */
6661/* gleaned from: */
6662/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6663
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006664static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006665{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006666 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006667 0, 0, 0, 0, 0, 0, 0, 0,
6668 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006669 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006670};
6671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672static int
6673unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 Py_UNICODE *s1 = str1->str;
6678 Py_UNICODE *s2 = str2->str;
6679
6680 len1 = str1->length;
6681 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006684 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006685
6686 c1 = *s1++;
6687 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006688
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 if (c1 > (1<<11) * 26)
6690 c1 += utf16Fixup[c1>>11];
6691 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006692 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006693 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006694
6695 if (c1 != c2)
6696 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006698 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 }
6700
6701 return (len1 < len2) ? -1 : (len1 != len2);
6702}
6703
Marc-André Lemburge5034372000-08-08 08:04:29 +00006704#else
6705
6706static int
6707unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006710
6711 Py_UNICODE *s1 = str1->str;
6712 Py_UNICODE *s2 = str2->str;
6713
6714 len1 = str1->length;
6715 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006716
Marc-André Lemburge5034372000-08-08 08:04:29 +00006717 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006718 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006719
Fredrik Lundh45714e92001-06-26 16:39:36 +00006720 c1 = *s1++;
6721 c2 = *s2++;
6722
6723 if (c1 != c2)
6724 return (c1 < c2) ? -1 : 1;
6725
Marc-André Lemburge5034372000-08-08 08:04:29 +00006726 len1--; len2--;
6727 }
6728
6729 return (len1 < len2) ? -1 : (len1 != len2);
6730}
6731
6732#endif
6733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006737 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6738 return unicode_compare((PyUnicodeObject *)left,
6739 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006740 PyErr_Format(PyExc_TypeError,
6741 "Can't compare %.100s and %.100s",
6742 left->ob_type->tp_name,
6743 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 return -1;
6745}
6746
Martin v. Löwis5b222132007-06-10 09:51:05 +00006747int
6748PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6749{
6750 int i;
6751 Py_UNICODE *id;
6752 assert(PyUnicode_Check(uni));
6753 id = PyUnicode_AS_UNICODE(uni);
6754 /* Compare Unicode string and source character set string */
6755 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 if (id[i] != str[i])
6757 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006758 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006760 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006762 return 0;
6763}
6764
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006765
Benjamin Peterson29060642009-01-31 22:14:21 +00006766#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006767 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006768
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006769PyObject *PyUnicode_RichCompare(PyObject *left,
6770 PyObject *right,
6771 int op)
6772{
6773 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006774
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006775 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6776 PyObject *v;
6777 if (((PyUnicodeObject *) left)->length !=
6778 ((PyUnicodeObject *) right)->length) {
6779 if (op == Py_EQ) {
6780 Py_INCREF(Py_False);
6781 return Py_False;
6782 }
6783 if (op == Py_NE) {
6784 Py_INCREF(Py_True);
6785 return Py_True;
6786 }
6787 }
6788 if (left == right)
6789 result = 0;
6790 else
6791 result = unicode_compare((PyUnicodeObject *)left,
6792 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006793
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006794 /* Convert the return value to a Boolean */
6795 switch (op) {
6796 case Py_EQ:
6797 v = TEST_COND(result == 0);
6798 break;
6799 case Py_NE:
6800 v = TEST_COND(result != 0);
6801 break;
6802 case Py_LE:
6803 v = TEST_COND(result <= 0);
6804 break;
6805 case Py_GE:
6806 v = TEST_COND(result >= 0);
6807 break;
6808 case Py_LT:
6809 v = TEST_COND(result == -1);
6810 break;
6811 case Py_GT:
6812 v = TEST_COND(result == 1);
6813 break;
6814 default:
6815 PyErr_BadArgument();
6816 return NULL;
6817 }
6818 Py_INCREF(v);
6819 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006820 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006821
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006822 Py_INCREF(Py_NotImplemented);
6823 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006824}
6825
Guido van Rossum403d68b2000-03-13 15:55:09 +00006826int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006828{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006829 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006830 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006831
6832 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006833 sub = PyUnicode_FromObject(element);
6834 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 PyErr_Format(PyExc_TypeError,
6836 "'in <string>' requires string as left operand, not %s",
6837 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006838 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006839 }
6840
Thomas Wouters477c8d52006-05-27 19:21:47 +00006841 str = PyUnicode_FromObject(container);
6842 if (!str) {
6843 Py_DECREF(sub);
6844 return -1;
6845 }
6846
6847 result = stringlib_contains_obj(str, sub);
6848
6849 Py_DECREF(str);
6850 Py_DECREF(sub);
6851
Guido van Rossum403d68b2000-03-13 15:55:09 +00006852 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006853}
6854
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855/* Concat to string or Unicode object giving a new Unicode object. */
6856
6857PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859{
6860 PyUnicodeObject *u = NULL, *v = NULL, *w;
6861
6862 /* Coerce the two arguments */
6863 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6864 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870 /* Shortcuts */
6871 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Py_DECREF(v);
6873 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 }
6875 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 Py_DECREF(u);
6877 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 }
6879
6880 /* Concat the two Unicode strings */
6881 w = _PyUnicode_New(u->length + v->length);
6882 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 Py_UNICODE_COPY(w->str, u->str, u->length);
6885 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6886
6887 Py_DECREF(u);
6888 Py_DECREF(v);
6889 return (PyObject *)w;
6890
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 Py_XDECREF(u);
6893 Py_XDECREF(v);
6894 return NULL;
6895}
6896
Walter Dörwald1ab83302007-05-18 17:15:44 +00006897void
6898PyUnicode_Append(PyObject **pleft, PyObject *right)
6899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006900 PyObject *new;
6901 if (*pleft == NULL)
6902 return;
6903 if (right == NULL || !PyUnicode_Check(*pleft)) {
6904 Py_DECREF(*pleft);
6905 *pleft = NULL;
6906 return;
6907 }
6908 new = PyUnicode_Concat(*pleft, right);
6909 Py_DECREF(*pleft);
6910 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006911}
6912
6913void
6914PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6915{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006916 PyUnicode_Append(pleft, right);
6917 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006918}
6919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006923Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006924string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927static PyObject *
6928unicode_count(PyUnicodeObject *self, PyObject *args)
6929{
6930 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006932 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 PyObject *result;
6934
Guido van Rossumb8872e62000-05-09 14:14:27 +00006935 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 return NULL;
6938
6939 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006940 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006943
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Christian Heimes217cfd12007-12-02 14:31:20 +00006946 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006947 stringlib_count(self->str + start, end - start,
6948 substring->str, substring->length)
6949 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950
6951 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006952
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 return result;
6954}
6955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006959Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006960to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006961handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6963'xmlcharrefreplace' as well as any other name registered with\n\
6964codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject *
6967unicode_encode(PyUnicodeObject *self, PyObject *args)
6968{
6969 char *encoding = NULL;
6970 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006971 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006972
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6974 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006975 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006976 if (v == NULL)
6977 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006978 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006979 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006980 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006981 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006982 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006983 Py_DECREF(v);
6984 return NULL;
6985 }
6986 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006987
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006989 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006990}
6991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994\n\
6995Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
6998static PyObject*
6999unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7000{
7001 Py_UNICODE *e;
7002 Py_UNICODE *p;
7003 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007004 Py_UNICODE *qe;
7005 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 PyUnicodeObject *u;
7007 int tabsize = 8;
7008
7009 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
Thomas Wouters7e474022000-07-16 12:04:32 +00007012 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007013 i = 0; /* chars up to and including most recent \n or \r */
7014 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7015 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 for (p = self->str; p < e; p++)
7017 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 if (tabsize > 0) {
7019 incr = tabsize - (j % tabsize); /* cannot overflow */
7020 if (j > PY_SSIZE_T_MAX - incr)
7021 goto overflow1;
7022 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007023 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 if (j > PY_SSIZE_T_MAX - 1)
7027 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 j++;
7029 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 if (i > PY_SSIZE_T_MAX - j)
7031 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007033 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
7035 }
7036
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007037 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007039
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 /* Second pass: create output string and fill it */
7041 u = _PyUnicode_New(i + j);
7042 if (!u)
7043 return NULL;
7044
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007045 j = 0; /* same as in first pass */
7046 q = u->str; /* next output char */
7047 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049 for (p = self->str; p < e; p++)
7050 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 if (tabsize > 0) {
7052 i = tabsize - (j % tabsize);
7053 j += i;
7054 while (i--) {
7055 if (q >= qe)
7056 goto overflow2;
7057 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007060 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 else {
7062 if (q >= qe)
7063 goto overflow2;
7064 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007065 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 if (*p == '\n' || *p == '\r')
7067 j = 0;
7068 }
7069
7070 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007071
7072 overflow2:
7073 Py_DECREF(u);
7074 overflow1:
7075 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081\n\
7082Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007083such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084arguments start and end are interpreted as in slice notation.\n\
7085\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
7088static PyObject *
7089unicode_find(PyUnicodeObject *self, PyObject *args)
7090{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007091 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007092 Py_ssize_t start;
7093 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007094 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
Christian Heimes9cd17752007-11-18 19:35:23 +00007096 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098
Thomas Wouters477c8d52006-05-27 19:21:47 +00007099 result = stringlib_find_slice(
7100 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7101 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7102 start, end
7103 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007106
Christian Heimes217cfd12007-12-02 14:31:20 +00007107 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108}
7109
7110static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
7113 if (index < 0 || index >= self->length) {
7114 PyErr_SetString(PyExc_IndexError, "string index out of range");
7115 return NULL;
7116 }
7117
7118 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7119}
7120
Guido van Rossumc2504932007-09-18 19:42:40 +00007121/* Believe it or not, this produces the same value for ASCII strings
7122 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007124unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125{
Guido van Rossumc2504932007-09-18 19:42:40 +00007126 Py_ssize_t len;
7127 Py_UNICODE *p;
7128 long x;
7129
7130 if (self->hash != -1)
7131 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007132 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007133 p = self->str;
7134 x = *p << 7;
7135 while (--len >= 0)
7136 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007137 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007138 if (x == -1)
7139 x = -2;
7140 self->hash = x;
7141 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142}
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007147Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149static PyObject *
7150unicode_index(PyUnicodeObject *self, PyObject *args)
7151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007152 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007154 Py_ssize_t start;
7155 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
Christian Heimes9cd17752007-11-18 19:35:23 +00007157 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 result = stringlib_find_slice(
7161 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7162 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7163 start, end
7164 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
7166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 if (result < 0) {
7169 PyErr_SetString(PyExc_ValueError, "substring not found");
7170 return NULL;
7171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172
Christian Heimes217cfd12007-12-02 14:31:20 +00007173 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007183unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184{
7185 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7186 register const Py_UNICODE *e;
7187 int cased;
7188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 /* Shortcut for single character strings */
7190 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007193 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007194 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007196
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 e = p + PyUnicode_GET_SIZE(self);
7198 cased = 0;
7199 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007201
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7203 return PyBool_FromLong(0);
7204 else if (!cased && Py_UNICODE_ISLOWER(ch))
7205 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007207 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007213Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007214at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007217unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218{
7219 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7220 register const Py_UNICODE *e;
7221 int cased;
7222
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 /* Shortcut for single character strings */
7224 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007227 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007228 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 e = p + PyUnicode_GET_SIZE(self);
7232 cased = 0;
7233 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7237 return PyBool_FromLong(0);
7238 else if (!cased && Py_UNICODE_ISUPPER(ch))
7239 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007241 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242}
7243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007247Return True if S is a titlecased string and there is at least one\n\
7248character in S, i.e. upper- and titlecase characters may only\n\
7249follow uncased characters and lowercase characters only cased ones.\n\
7250Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007253unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7256 register const Py_UNICODE *e;
7257 int cased, previous_is_cased;
7258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 /* Shortcut for single character strings */
7260 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7262 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007264 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007265 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 e = p + PyUnicode_GET_SIZE(self);
7269 cased = 0;
7270 previous_is_cased = 0;
7271 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007273
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7275 if (previous_is_cased)
7276 return PyBool_FromLong(0);
7277 previous_is_cased = 1;
7278 cased = 1;
7279 }
7280 else if (Py_UNICODE_ISLOWER(ch)) {
7281 if (!previous_is_cased)
7282 return PyBool_FromLong(0);
7283 previous_is_cased = 1;
7284 cased = 1;
7285 }
7286 else
7287 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007289 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007295Return True if all characters in S are whitespace\n\
7296and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007299unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
7301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7302 register const Py_UNICODE *e;
7303
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 /* Shortcut for single character strings */
7305 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 Py_UNICODE_ISSPACE(*p))
7307 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007309 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007310 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 e = p + PyUnicode_GET_SIZE(self);
7314 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 if (!Py_UNICODE_ISSPACE(*p))
7316 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007318 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319}
7320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007323\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007324Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007326
7327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007328unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007329{
7330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7331 register const Py_UNICODE *e;
7332
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007333 /* Shortcut for single character strings */
7334 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 Py_UNICODE_ISALPHA(*p))
7336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007337
7338 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007339 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007341
7342 e = p + PyUnicode_GET_SIZE(self);
7343 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 if (!Py_UNICODE_ISALPHA(*p))
7345 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007346 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007347 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007348}
7349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007350PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007352\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007353Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007355
7356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007357unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007358{
7359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7360 register const Py_UNICODE *e;
7361
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362 /* Shortcut for single character strings */
7363 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 Py_UNICODE_ISALNUM(*p))
7365 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366
7367 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007368 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007370
7371 e = p + PyUnicode_GET_SIZE(self);
7372 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 if (!Py_UNICODE_ISALNUM(*p))
7374 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007376 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007377}
7378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007379PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007382Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
7385static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007386unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387{
7388 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7389 register const Py_UNICODE *e;
7390
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 /* Shortcut for single character strings */
7392 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 Py_UNICODE_ISDECIMAL(*p))
7394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007396 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007397 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007399
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 e = p + PyUnicode_GET_SIZE(self);
7401 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 if (!Py_UNICODE_ISDECIMAL(*p))
7403 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007405 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406}
7407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007411Return True if all characters in S are digits\n\
7412and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007415unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7418 register const Py_UNICODE *e;
7419
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 /* Shortcut for single character strings */
7421 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 Py_UNICODE_ISDIGIT(*p))
7423 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007425 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007426 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007428
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 e = p + PyUnicode_GET_SIZE(self);
7430 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 if (!Py_UNICODE_ISDIGIT(*p))
7432 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007440Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
7443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007444unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445{
7446 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7447 register const Py_UNICODE *e;
7448
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 /* Shortcut for single character strings */
7450 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 Py_UNICODE_ISNUMERIC(*p))
7452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007454 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007455 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007457
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 e = p + PyUnicode_GET_SIZE(self);
7459 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 if (!Py_UNICODE_ISNUMERIC(*p))
7461 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007463 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464}
7465
Martin v. Löwis47383402007-08-15 07:32:56 +00007466int
7467PyUnicode_IsIdentifier(PyObject *self)
7468{
7469 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7470 register const Py_UNICODE *e;
7471
7472 /* Special case for empty strings */
7473 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007475
7476 /* PEP 3131 says that the first character must be in
7477 XID_Start and subsequent characters in XID_Continue,
7478 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007480 letters, digits, underscore). However, given the current
7481 definition of XID_Start and XID_Continue, it is sufficient
7482 to check just for these, except that _ must be allowed
7483 as starting an identifier. */
7484 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7485 return 0;
7486
7487 e = p + PyUnicode_GET_SIZE(self);
7488 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 if (!_PyUnicode_IsXidContinue(*p))
7490 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007491 }
7492 return 1;
7493}
7494
7495PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007497\n\
7498Return True if S is a valid identifier according\n\
7499to the language definition.");
7500
7501static PyObject*
7502unicode_isidentifier(PyObject *self)
7503{
7504 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7505}
7506
Georg Brandl559e5d72008-06-11 18:37:52 +00007507PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007509\n\
7510Return True if all characters in S are considered\n\
7511printable in repr() or S is empty, False otherwise.");
7512
7513static PyObject*
7514unicode_isprintable(PyObject *self)
7515{
7516 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7517 register const Py_UNICODE *e;
7518
7519 /* Shortcut for single character strings */
7520 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7521 Py_RETURN_TRUE;
7522 }
7523
7524 e = p + PyUnicode_GET_SIZE(self);
7525 for (; p < e; p++) {
7526 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7527 Py_RETURN_FALSE;
7528 }
7529 }
7530 Py_RETURN_TRUE;
7531}
7532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535\n\
7536Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538
7539static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007540unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007542 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543}
7544
Martin v. Löwis18e16552006-02-15 17:27:45 +00007545static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546unicode_length(PyUnicodeObject *self)
7547{
7548 return self->length;
7549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007554Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007555done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject *
7558unicode_ljust(PyUnicodeObject *self, PyObject *args)
7559{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007560 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007561 Py_UNICODE fillchar = ' ';
7562
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007563 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565
Tim Peters7a29bd52001-09-12 03:03:31 +00007566 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 Py_INCREF(self);
7568 return (PyObject*) self;
7569 }
7570
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007571 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572}
7573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007577Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578
7579static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007580unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 return fixup(self, fixlower);
7583}
7584
/* Selectors naming which end(s) of the string get stripped. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: its format string with the leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7593
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007594/* externally visible for str.strip(unicode) */
7595PyObject *
7596_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7597{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7599 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7600 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7601 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7602 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007603
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007605
Benjamin Peterson14339b62009-01-31 16:36:08 +00007606 i = 0;
7607 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7609 i++;
7610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007612
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 j = len;
7614 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 do {
7616 j--;
7617 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7618 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007619 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007620
Benjamin Peterson14339b62009-01-31 16:36:08 +00007621 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 Py_INCREF(self);
7623 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 }
7625 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007627}
7628
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007631do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7634 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007635
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 i = 0;
7637 if (striptype != RIGHTSTRIP) {
7638 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7639 i++;
7640 }
7641 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007642
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643 j = len;
7644 if (striptype != LEFTSTRIP) {
7645 do {
7646 j--;
7647 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7648 j++;
7649 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007650
Benjamin Peterson14339b62009-01-31 16:36:08 +00007651 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7652 Py_INCREF(self);
7653 return (PyObject*)self;
7654 }
7655 else
7656 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007659
7660static PyObject *
7661do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7662{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7666 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007667
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 if (sep != NULL && sep != Py_None) {
7669 if (PyUnicode_Check(sep))
7670 return _PyUnicode_XStrip(self, striptype, sep);
7671 else {
7672 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 "%s arg must be None or str",
7674 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 return NULL;
7676 }
7677 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007678
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007680}
7681
7682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007683PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007685\n\
7686Return a copy of the string S with leading and trailing\n\
7687whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007688If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689
7690static PyObject *
7691unicode_strip(PyUnicodeObject *self, PyObject *args)
7692{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693 if (PyTuple_GET_SIZE(args) == 0)
7694 return do_strip(self, BOTHSTRIP); /* Common case */
7695 else
7696 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697}
7698
7699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007700PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702\n\
7703Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007704If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705
7706static PyObject *
7707unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709 if (PyTuple_GET_SIZE(args) == 0)
7710 return do_strip(self, LEFTSTRIP); /* Common case */
7711 else
7712 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713}
7714
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007718\n\
7719Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007720If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
7722static PyObject *
7723unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 if (PyTuple_GET_SIZE(args) == 0)
7726 return do_strip(self, RIGHTSTRIP); /* Common case */
7727 else
7728 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729}
7730
7731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007733unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734{
7735 PyUnicodeObject *u;
7736 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007738 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739
7740 if (len < 0)
7741 len = 0;
7742
Tim Peters7a29bd52001-09-12 03:03:31 +00007743 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 /* no repeat, return original string */
7745 Py_INCREF(str);
7746 return (PyObject*) str;
7747 }
Tim Peters8f422462000-09-09 06:13:41 +00007748
7749 /* ensure # of chars needed doesn't overflow int and # of bytes
7750 * needed doesn't overflow size_t
7751 */
7752 nchars = len * str->length;
7753 if (len && nchars / len != str->length) {
7754 PyErr_SetString(PyExc_OverflowError,
7755 "repeated string is too long");
7756 return NULL;
7757 }
7758 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7759 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7760 PyErr_SetString(PyExc_OverflowError,
7761 "repeated string is too long");
7762 return NULL;
7763 }
7764 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 if (!u)
7766 return NULL;
7767
7768 p = u->str;
7769
Thomas Wouters477c8d52006-05-27 19:21:47 +00007770 if (str->length == 1 && len > 0) {
7771 Py_UNICODE_FILL(p, str->str[0], len);
7772 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 Py_ssize_t done = 0; /* number of characters copied this far */
7774 if (done < nchars) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007775 Py_UNICODE_COPY(p, str->str, str->length);
7776 done = str->length;
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 }
7778 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007779 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007780 Py_UNICODE_COPY(p+done, p, n);
7781 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 }
7784
7785 return (PyObject*) u;
7786}
7787
7788PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 PyObject *subobj,
7790 PyObject *replobj,
7791 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792{
7793 PyObject *self;
7794 PyObject *str1;
7795 PyObject *str2;
7796 PyObject *result;
7797
7798 self = PyUnicode_FromObject(obj);
7799 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 str1 = PyUnicode_FromObject(subobj);
7802 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_DECREF(self);
7804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 }
7806 str2 = PyUnicode_FromObject(replobj);
7807 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 Py_DECREF(self);
7809 Py_DECREF(str1);
7810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 }
Tim Petersced69f82003-09-16 20:30:58 +00007812 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 (PyUnicodeObject *)str1,
7814 (PyUnicodeObject *)str2,
7815 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 Py_DECREF(self);
7817 Py_DECREF(str1);
7818 Py_DECREF(str2);
7819 return result;
7820}
7821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824\n\
7825Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007826old replaced by new. If the optional argument count is\n\
7827given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
7829static PyObject*
7830unicode_replace(PyUnicodeObject *self, PyObject *args)
7831{
7832 PyUnicodeObject *str1;
7833 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 PyObject *result;
7836
Martin v. Löwis18e16552006-02-15 17:27:45 +00007837 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 return NULL;
7839 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7840 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007843 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 Py_DECREF(str1);
7845 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847
7848 result = replace(self, str1, str2, maxcount);
7849
7850 Py_DECREF(str1);
7851 Py_DECREF(str2);
7852 return result;
7853}
7854
7855static
7856PyObject *unicode_repr(PyObject *unicode)
7857{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007858 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007859 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007860 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7861 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7862
7863 /* XXX(nnorwitz): rather than over-allocating, it would be
7864 better to choose a different scheme. Perhaps scan the
7865 first N-chars of the string and allocate based on that size.
7866 */
7867 /* Initial allocation is based on the longest-possible unichr
7868 escape.
7869
7870 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7871 unichr, so in this case it's the longest unichr escape. In
7872 narrow (UTF-16) builds this is five chars per source unichr
7873 since there are two unichrs in the surrogate pair, so in narrow
7874 (UTF-16) builds it's not the longest unichr escape.
7875
7876 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7877 so in the narrow (UTF-16) build case it's the longest unichr
7878 escape.
7879 */
7880
Walter Dörwald1ab83302007-05-18 17:15:44 +00007881 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007883#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007885#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007887#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007889 if (repr == NULL)
7890 return NULL;
7891
Walter Dörwald1ab83302007-05-18 17:15:44 +00007892 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007893
7894 /* Add quote */
7895 *p++ = (findchar(s, size, '\'') &&
7896 !findchar(s, size, '"')) ? '"' : '\'';
7897 while (size-- > 0) {
7898 Py_UNICODE ch = *s++;
7899
7900 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007901 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007902 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007903 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007904 continue;
7905 }
7906
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007908 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007909 *p++ = '\\';
7910 *p++ = 't';
7911 }
7912 else if (ch == '\n') {
7913 *p++ = '\\';
7914 *p++ = 'n';
7915 }
7916 else if (ch == '\r') {
7917 *p++ = '\\';
7918 *p++ = 'r';
7919 }
7920
7921 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007922 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007923 *p++ = '\\';
7924 *p++ = 'x';
7925 *p++ = hexdigits[(ch >> 4) & 0x000F];
7926 *p++ = hexdigits[ch & 0x000F];
7927 }
7928
Georg Brandl559e5d72008-06-11 18:37:52 +00007929 /* Copy ASCII characters as-is */
7930 else if (ch < 0x7F) {
7931 *p++ = ch;
7932 }
7933
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007935 else {
7936 Py_UCS4 ucs = ch;
7937
7938#ifndef Py_UNICODE_WIDE
7939 Py_UNICODE ch2 = 0;
7940 /* Get code point from surrogate pair */
7941 if (size > 0) {
7942 ch2 = *s;
7943 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007948 size--;
7949 }
7950 }
7951#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007952 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007953 (categories Z* and C* except ASCII space)
7954 */
7955 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7956 /* Map 8-bit characters to '\xhh' */
7957 if (ucs <= 0xff) {
7958 *p++ = '\\';
7959 *p++ = 'x';
7960 *p++ = hexdigits[(ch >> 4) & 0x000F];
7961 *p++ = hexdigits[ch & 0x000F];
7962 }
7963 /* Map 21-bit characters to '\U00xxxxxx' */
7964 else if (ucs >= 0x10000) {
7965 *p++ = '\\';
7966 *p++ = 'U';
7967 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7968 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7969 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7970 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7971 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7972 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7973 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7974 *p++ = hexdigits[ucs & 0x0000000F];
7975 }
7976 /* Map 16-bit characters to '\uxxxx' */
7977 else {
7978 *p++ = '\\';
7979 *p++ = 'u';
7980 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7981 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7982 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7983 *p++ = hexdigits[ucs & 0x000F];
7984 }
7985 }
7986 /* Copy characters as-is */
7987 else {
7988 *p++ = ch;
7989#ifndef Py_UNICODE_WIDE
7990 if (ucs >= 0x10000)
7991 *p++ = ch2;
7992#endif
7993 }
7994 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007995 }
7996 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007997 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007998
7999 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008000 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008001 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008004PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006\n\
8007Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008008such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009arguments start and end are interpreted as in slice notation.\n\
8010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008011Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013static PyObject *
8014unicode_rfind(PyUnicodeObject *self, PyObject *args)
8015{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008016 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008017 Py_ssize_t start;
8018 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008019 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Christian Heimes9cd17752007-11-18 19:35:23 +00008021 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Thomas Wouters477c8d52006-05-27 19:21:47 +00008024 result = stringlib_rfind_slice(
8025 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8026 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8027 start, end
8028 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029
8030 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008031
Christian Heimes217cfd12007-12-02 14:31:20 +00008032 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033}
8034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008035PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008038Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039
8040static PyObject *
8041unicode_rindex(PyUnicodeObject *self, PyObject *args)
8042{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008044 Py_ssize_t start;
8045 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008046 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
Christian Heimes9cd17752007-11-18 19:35:23 +00008048 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
Thomas Wouters477c8d52006-05-27 19:21:47 +00008051 result = stringlib_rfind_slice(
8052 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8053 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8054 start, end
8055 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
8057 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008058
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 if (result < 0) {
8060 PyErr_SetString(PyExc_ValueError, "substring not found");
8061 return NULL;
8062 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008063 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064}
8065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008066PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008069Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008070done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072static PyObject *
8073unicode_rjust(PyUnicodeObject *self, PyObject *args)
8074{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008075 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008076 Py_UNICODE fillchar = ' ';
8077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008078 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 return NULL;
8080
Tim Peters7a29bd52001-09-12 03:03:31 +00008081 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 Py_INCREF(self);
8083 return (PyObject*) self;
8084 }
8085
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008086 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087}
8088
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 PyObject *sep,
8091 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
8093 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008094
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 s = PyUnicode_FromObject(s);
8096 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 if (sep != NULL) {
8099 sep = PyUnicode_FromObject(sep);
8100 if (sep == NULL) {
8101 Py_DECREF(s);
8102 return NULL;
8103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
8105
8106 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8107
8108 Py_DECREF(s);
8109 Py_XDECREF(sep);
8110 return result;
8111}
8112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008113PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115\n\
8116Return a list of the words in S, using sep as the\n\
8117delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008118splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008119whitespace string is a separator and empty strings are\n\
8120removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121
8122static PyObject*
8123unicode_split(PyUnicodeObject *self, PyObject *args)
8124{
8125 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008126 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return NULL;
8130
8131 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137}
8138
Thomas Wouters477c8d52006-05-27 19:21:47 +00008139PyObject *
8140PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8141{
8142 PyObject* str_obj;
8143 PyObject* sep_obj;
8144 PyObject* out;
8145
8146 str_obj = PyUnicode_FromObject(str_in);
8147 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008149 sep_obj = PyUnicode_FromObject(sep_in);
8150 if (!sep_obj) {
8151 Py_DECREF(str_obj);
8152 return NULL;
8153 }
8154
8155 out = stringlib_partition(
8156 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8157 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8158 );
8159
8160 Py_DECREF(sep_obj);
8161 Py_DECREF(str_obj);
8162
8163 return out;
8164}
8165
8166
8167PyObject *
8168PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8169{
8170 PyObject* str_obj;
8171 PyObject* sep_obj;
8172 PyObject* out;
8173
8174 str_obj = PyUnicode_FromObject(str_in);
8175 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008177 sep_obj = PyUnicode_FromObject(sep_in);
8178 if (!sep_obj) {
8179 Py_DECREF(str_obj);
8180 return NULL;
8181 }
8182
8183 out = stringlib_rpartition(
8184 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8185 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8186 );
8187
8188 Py_DECREF(sep_obj);
8189 Py_DECREF(str_obj);
8190
8191 return out;
8192}
8193
8194PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008196\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008197Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008199found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200
8201static PyObject*
8202unicode_partition(PyUnicodeObject *self, PyObject *separator)
8203{
8204 return PyUnicode_Partition((PyObject *)self, separator);
8205}
8206
8207PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008210Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008211the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008212separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008213
8214static PyObject*
8215unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8216{
8217 return PyUnicode_RPartition((PyObject *)self, separator);
8218}
8219
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008220PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 PyObject *sep,
8222 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008223{
8224 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008225
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008226 s = PyUnicode_FromObject(s);
8227 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 if (sep != NULL) {
8230 sep = PyUnicode_FromObject(sep);
8231 if (sep == NULL) {
8232 Py_DECREF(s);
8233 return NULL;
8234 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008235 }
8236
8237 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8238
8239 Py_DECREF(s);
8240 Py_XDECREF(sep);
8241 return result;
8242}
8243
8244PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008246\n\
8247Return a list of the words in S, using sep as the\n\
8248delimiter string, starting at the end of the string and\n\
8249working to the front. If maxsplit is given, at most maxsplit\n\
8250splits are done. If sep is not specified, any whitespace string\n\
8251is a separator.");
8252
8253static PyObject*
8254unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8255{
8256 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008257 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008258
Martin v. Löwis18e16552006-02-15 17:27:45 +00008259 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008260 return NULL;
8261
8262 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008264 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008266 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008268}
8269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008270PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272\n\
8273Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008274Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008275is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
8277static PyObject*
8278unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8279{
Guido van Rossum86662912000-04-11 15:38:46 +00008280 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Guido van Rossum86662912000-04-11 15:38:46 +00008282 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 return NULL;
8284
Guido van Rossum86662912000-04-11 15:38:46 +00008285 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286}
8287
8288static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008289PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Walter Dörwald346737f2007-05-31 10:44:43 +00008291 if (PyUnicode_CheckExact(self)) {
8292 Py_INCREF(self);
8293 return self;
8294 } else
8295 /* Subtype -- return genuine unicode string with the same value. */
8296 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8297 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298}
8299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008300PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302\n\
8303Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008304and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
8306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008307unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 return fixup(self, fixswapcase);
8310}
8311
Georg Brandlceee0772007-11-27 23:48:05 +00008312PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008314\n\
8315Return a translation table usable for str.translate().\n\
8316If there is only one argument, it must be a dictionary mapping Unicode\n\
8317ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008318Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008319If there are two arguments, they must be strings of equal length, and\n\
8320in the resulting dictionary, each character in x will be mapped to the\n\
8321character at the same position in y. If there is a third argument, it\n\
8322must be a string, whose characters will be mapped to None in the result.");
8323
8324static PyObject*
8325unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8326{
8327 PyObject *x, *y = NULL, *z = NULL;
8328 PyObject *new = NULL, *key, *value;
8329 Py_ssize_t i = 0;
8330 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008331
Georg Brandlceee0772007-11-27 23:48:05 +00008332 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8333 return NULL;
8334 new = PyDict_New();
8335 if (!new)
8336 return NULL;
8337 if (y != NULL) {
8338 /* x must be a string too, of equal length */
8339 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8340 if (!PyUnicode_Check(x)) {
8341 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8342 "be a string if there is a second argument");
8343 goto err;
8344 }
8345 if (PyUnicode_GET_SIZE(x) != ylen) {
8346 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8347 "arguments must have equal length");
8348 goto err;
8349 }
8350 /* create entries for translating chars in x to those in y */
8351 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008352 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8353 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008354 if (!key || !value)
8355 goto err;
8356 res = PyDict_SetItem(new, key, value);
8357 Py_DECREF(key);
8358 Py_DECREF(value);
8359 if (res < 0)
8360 goto err;
8361 }
8362 /* create entries for deleting chars in z */
8363 if (z != NULL) {
8364 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008366 if (!key)
8367 goto err;
8368 res = PyDict_SetItem(new, key, Py_None);
8369 Py_DECREF(key);
8370 if (res < 0)
8371 goto err;
8372 }
8373 }
8374 } else {
8375 /* x must be a dict */
8376 if (!PyDict_Check(x)) {
8377 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8378 "to maketrans it must be a dict");
8379 goto err;
8380 }
8381 /* copy entries into the new dict, converting string keys to int keys */
8382 while (PyDict_Next(x, &i, &key, &value)) {
8383 if (PyUnicode_Check(key)) {
8384 /* convert string keys to integer keys */
8385 PyObject *newkey;
8386 if (PyUnicode_GET_SIZE(key) != 1) {
8387 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8388 "table must be of length 1");
8389 goto err;
8390 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008391 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008392 if (!newkey)
8393 goto err;
8394 res = PyDict_SetItem(new, newkey, value);
8395 Py_DECREF(newkey);
8396 if (res < 0)
8397 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008398 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008399 /* just keep integer keys */
8400 if (PyDict_SetItem(new, key, value) < 0)
8401 goto err;
8402 } else {
8403 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8404 "be strings or integers");
8405 goto err;
8406 }
8407 }
8408 }
8409 return new;
8410 err:
8411 Py_DECREF(new);
8412 return NULL;
8413}
8414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008415PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417\n\
8418Return a copy of the string S, where all characters have been mapped\n\
8419through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008420Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008421Unmapped characters are left untouched. Characters mapped to None\n\
8422are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423
8424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008425unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426{
Georg Brandlceee0772007-11-27 23:48:05 +00008427 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428}
8429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008430PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008433Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434
8435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008436unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 return fixup(self, fixupper);
8439}
8440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008441PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008444Pad a numeric string S with zeros on the left, to fill a field\n\
8445of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
8447static PyObject *
8448unicode_zfill(PyUnicodeObject *self, PyObject *args)
8449{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 PyUnicodeObject *u;
8452
Martin v. Löwis18e16552006-02-15 17:27:45 +00008453 Py_ssize_t width;
8454 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 return NULL;
8456
8457 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008458 if (PyUnicode_CheckExact(self)) {
8459 Py_INCREF(self);
8460 return (PyObject*) self;
8461 }
8462 else
8463 return PyUnicode_FromUnicode(
8464 PyUnicode_AS_UNICODE(self),
8465 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
8468
8469 fill = width - self->length;
8470
8471 u = pad(self, fill, 0, '0');
8472
Walter Dörwald068325e2002-04-15 13:36:47 +00008473 if (u == NULL)
8474 return NULL;
8475
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 if (u->str[fill] == '+' || u->str[fill] == '-') {
8477 /* move sign to beginning of string */
8478 u->str[0] = u->str[fill];
8479 u->str[fill] = '0';
8480 }
8481
8482 return (PyObject*) u;
8483}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008493PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008496Return True if S starts with the specified prefix, False otherwise.\n\
8497With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008498With optional end, stop comparing S at that position.\n\
8499prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
8501static PyObject *
8502unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008505 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008508 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008509 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008511 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8513 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008514 if (PyTuple_Check(subobj)) {
8515 Py_ssize_t i;
8516 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8517 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008519 if (substring == NULL)
8520 return NULL;
8521 result = tailmatch(self, substring, start, end, -1);
8522 Py_DECREF(substring);
8523 if (result) {
8524 Py_RETURN_TRUE;
8525 }
8526 }
8527 /* nothing matched */
8528 Py_RETURN_FALSE;
8529 }
8530 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008533 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008535 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536}
8537
8538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008539PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008542Return True if S ends with the specified suffix, False otherwise.\n\
8543With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008544With optional end, stop comparing S at that position.\n\
8545suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
8547static PyObject *
8548unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008553 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008554 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008555 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008557 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8559 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008560 if (PyTuple_Check(subobj)) {
8561 Py_ssize_t i;
8562 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8563 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008565 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567 result = tailmatch(self, substring, start, end, +1);
8568 Py_DECREF(substring);
8569 if (result) {
8570 Py_RETURN_TRUE;
8571 }
8572 }
8573 Py_RETURN_FALSE;
8574 }
8575 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008579 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008581 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582}
8583
Eric Smith8c663262007-08-25 02:26:07 +00008584#include "stringlib/string_format.h"
8585
8586PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008588\n\
8589");
8590
Eric Smith4a7d76d2008-05-30 18:10:19 +00008591static PyObject *
8592unicode__format__(PyObject* self, PyObject* args)
8593{
8594 PyObject *format_spec;
8595
8596 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8597 return NULL;
8598
8599 return _PyUnicode_FormatAdvanced(self,
8600 PyUnicode_AS_UNICODE(format_spec),
8601 PyUnicode_GET_SIZE(format_spec));
8602}
8603
Eric Smith8c663262007-08-25 02:26:07 +00008604PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008606\n\
8607");
8608
8609static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008610unicode__sizeof__(PyUnicodeObject *v)
8611{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008612 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8613 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008614}
8615
8616PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008618
8619static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008620unicode_getnewargs(PyUnicodeObject *v)
8621{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008623}
8624
8625
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626static PyMethodDef unicode_methods[] = {
8627
8628 /* Order is according to common usage: often used methods should
8629 appear first, since lookup is done sequentially. */
8630
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008631 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8632 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8633 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008634 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008635 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8636 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8637 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8638 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8639 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8640 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8641 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008642 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008643 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8644 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8645 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008646 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008647 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8648 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8649 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008650 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008651 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008652 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008653 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008654 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8655 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8656 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8657 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8658 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8659 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8660 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8661 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8662 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8663 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8664 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8665 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8666 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8667 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008668 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008669 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008670 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008671 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008672 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008673 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8674 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008675 {"maketrans", (PyCFunction) unicode_maketrans,
8676 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008677 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008678#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008679 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680#endif
8681
8682#if 0
8683 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008684 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685#endif
8686
Benjamin Peterson14339b62009-01-31 16:36:08 +00008687 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 {NULL, NULL}
8689};
8690
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008691static PyObject *
8692unicode_mod(PyObject *v, PyObject *w)
8693{
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 if (!PyUnicode_Check(v)) {
8695 Py_INCREF(Py_NotImplemented);
8696 return Py_NotImplemented;
8697 }
8698 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008699}
8700
8701static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 0, /*nb_add*/
8703 0, /*nb_subtract*/
8704 0, /*nb_multiply*/
8705 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008706};
8707
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709 (lenfunc) unicode_length, /* sq_length */
8710 PyUnicode_Concat, /* sq_concat */
8711 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8712 (ssizeargfunc) unicode_getitem, /* sq_item */
8713 0, /* sq_slice */
8714 0, /* sq_ass_item */
8715 0, /* sq_ass_slice */
8716 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717};
8718
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008719static PyObject*
8720unicode_subscript(PyUnicodeObject* self, PyObject* item)
8721{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008722 if (PyIndex_Check(item)) {
8723 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008724 if (i == -1 && PyErr_Occurred())
8725 return NULL;
8726 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008727 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008728 return unicode_getitem(self, i);
8729 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008731 Py_UNICODE* source_buf;
8732 Py_UNICODE* result_buf;
8733 PyObject* result;
8734
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008735 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008737 return NULL;
8738 }
8739
8740 if (slicelength <= 0) {
8741 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008742 } else if (start == 0 && step == 1 && slicelength == self->length &&
8743 PyUnicode_CheckExact(self)) {
8744 Py_INCREF(self);
8745 return (PyObject *)self;
8746 } else if (step == 1) {
8747 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008748 } else {
8749 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008750 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8751 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 if (result_buf == NULL)
8754 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008755
8756 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8757 result_buf[i] = source_buf[cur];
8758 }
Tim Petersced69f82003-09-16 20:30:58 +00008759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008760 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008761 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008762 return result;
8763 }
8764 } else {
8765 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8766 return NULL;
8767 }
8768}
8769
8770static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771 (lenfunc)unicode_length, /* mp_length */
8772 (binaryfunc)unicode_subscript, /* mp_subscript */
8773 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008774};
8775
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777/* Helpers for PyUnicode_Format() */
8778
8779static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008780getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008782 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 (*p_argidx)++;
8785 if (arglen < 0)
8786 return args;
8787 else
8788 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
8790 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 return NULL;
8793}
8794
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008796strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008798 register Py_ssize_t i;
8799 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 for (i = len - 1; i >= 0; i--)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 return len;
8804}
8805
Neal Norwitzfc76d632006-01-10 06:03:13 +00008806static int
8807doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8808{
Tim Peters15231542006-02-16 01:08:01 +00008809 Py_ssize_t result;
8810
Neal Norwitzfc76d632006-01-10 06:03:13 +00008811 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008812 result = strtounicode(buffer, (char *)buffer);
8813 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008814}
8815
#if 0
/* Currently compiled out.  Integer counterpart of doubletounicode():
   format x with PyOS_snprintf() into buffer's storage, widen the bytes in
   place with strtounicode(), and return the length in characters. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008827
Guido van Rossum078151d2002-08-11 04:24:12 +00008828/* XXX To save some code duplication, formatfloat/long/int could have been
8829 shared with stringobject.c, converting from 8-bit to Unicode after the
8830 formatting is done. */
8831
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832static int
8833formatfloat(Py_UNICODE *buf,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 size_t buflen,
8835 int flags,
8836 int prec,
8837 int type,
8838 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008840 /* fmt = '%#.' + `prec` + `type`
8841 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 char fmt[20];
8843 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008844
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845 x = PyFloat_AsDouble(v);
8846 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008850 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008852 /* Worst case length calc to ensure no buffer overrun:
8853
8854 'g' formats:
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 fmt = %#.<prec>g
8856 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8857 for any double rep.)
8858 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008859
8860 'f' formats:
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8862 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008863
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008864 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008865 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008866
8867 */
Benjamin Peterson14339b62009-01-31 16:36:08 +00008868 if (((type == 'g' || type == 'G') &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 buflen <= (size_t)10 + (size_t)prec) ||
8870 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8871 PyErr_SetString(PyExc_OverflowError,
8872 "formatted float is too long (precision too large?)");
8873 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008874 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008875 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 (flags&F_ALT) ? "#" : "",
8877 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008878 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879}
8880
Tim Peters38fd5b62000-09-21 05:43:11 +00008881static PyObject*
8882formatlong(PyObject *val, int flags, int prec, int type)
8883{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 char *buf;
8885 int len;
8886 PyObject *str; /* temporary string object. */
8887 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008888
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8890 if (!str)
8891 return NULL;
8892 result = PyUnicode_FromStringAndSize(buf, len);
8893 Py_DECREF(str);
8894 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008895}
8896
#if 0
/* Currently compiled out.  Format the C long value of v into buf for a
   %d/%u/%o/%x/%X conversion.

   Returns the number of characters written, or -1 with an exception set
   (from PyLong_AsLong(), or OverflowError when buf may be too small). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     * + 1 + 1
     * = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *sign;
    int is_hex_or_octal;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';

    /* For the unsigned C conversions a negative value is printed as an
       explicit '-' followed by the magnitude. */
    is_hex_or_octal = (type == 'x' || type == 'X' || type == 'o');
    sign = (x < 0 && is_hex_or_octal) ? "-" : "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_hex_or_octal) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    /* With an explicit sign in fmt, pass the magnitude. */
    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974
8975static int
8976formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008977 size_t buflen,
8978 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008980 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008981 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 if (PyUnicode_GET_SIZE(v) == 1) {
8983 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8984 buf[1] = '\0';
8985 return 1;
8986 }
8987#ifndef Py_UNICODE_WIDE
8988 if (PyUnicode_GET_SIZE(v) == 2) {
8989 /* Decode a valid surrogate pair */
8990 int c0 = PyUnicode_AS_UNICODE(v)[0];
8991 int c1 = PyUnicode_AS_UNICODE(v)[1];
8992 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8993 0xDC00 <= c1 && c1 <= 0xDFFF) {
8994 buf[0] = c0;
8995 buf[1] = c1;
8996 buf[2] = '\0';
8997 return 2;
8998 }
8999 }
9000#endif
9001 goto onError;
9002 }
9003 else {
9004 /* Integer input truncated to a character */
9005 long x;
9006 x = PyLong_AsLong(v);
9007 if (x == -1 && PyErr_Occurred())
9008 goto onError;
9009
9010 if (x < 0 || x > 0x10ffff) {
9011 PyErr_SetString(PyExc_OverflowError,
9012 "%c arg not in range(0x110000)");
9013 return -1;
9014 }
9015
9016#ifndef Py_UNICODE_WIDE
9017 if (x > 0xffff) {
9018 x -= 0x10000;
9019 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9020 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9021 return 2;
9022 }
9023#endif
9024 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 buf[1] = '\0';
9026 return 1;
9027 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009028
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009030 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009032 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033}
9034
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after sizeof or next to a postfix
   operator).
*/
#define FORMATBUFLEN ((size_t)120)
9044
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047{
9048 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009049 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 int args_owned = 0;
9051 PyUnicodeObject *result = NULL;
9052 PyObject *dict = NULL;
9053 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009054
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 PyErr_BadInternalCall();
9057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 }
9059 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009060 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062 fmt = PyUnicode_AS_UNICODE(uformat);
9063 fmtcnt = PyUnicode_GET_SIZE(uformat);
9064
9065 reslen = rescnt = fmtcnt + 100;
9066 result = _PyUnicode_New(reslen);
9067 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 res = PyUnicode_AS_UNICODE(result);
9070
9071 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 arglen = PyTuple_Size(args);
9073 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074 }
9075 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 arglen = -1;
9077 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009079 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009080 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082
9083 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 if (*fmt != '%') {
9085 if (--rescnt < 0) {
9086 rescnt = fmtcnt + 100;
9087 reslen += rescnt;
9088 if (_PyUnicode_Resize(&result, reslen) < 0)
9089 goto onError;
9090 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9091 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009094 }
9095 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 /* Got a format specifier */
9097 int flags = 0;
9098 Py_ssize_t width = -1;
9099 int prec = -1;
9100 Py_UNICODE c = '\0';
9101 Py_UNICODE fill;
9102 int isnumok;
9103 PyObject *v = NULL;
9104 PyObject *temp = NULL;
9105 Py_UNICODE *pbuf;
9106 Py_UNICODE sign;
9107 Py_ssize_t len;
9108 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 fmt++;
9111 if (*fmt == '(') {
9112 Py_UNICODE *keystart;
9113 Py_ssize_t keylen;
9114 PyObject *key;
9115 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009116
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 if (dict == NULL) {
9118 PyErr_SetString(PyExc_TypeError,
9119 "format requires a mapping");
9120 goto onError;
9121 }
9122 ++fmt;
9123 --fmtcnt;
9124 keystart = fmt;
9125 /* Skip over balanced parentheses */
9126 while (pcount > 0 && --fmtcnt >= 0) {
9127 if (*fmt == ')')
9128 --pcount;
9129 else if (*fmt == '(')
9130 ++pcount;
9131 fmt++;
9132 }
9133 keylen = fmt - keystart - 1;
9134 if (fmtcnt < 0 || pcount > 0) {
9135 PyErr_SetString(PyExc_ValueError,
9136 "incomplete format key");
9137 goto onError;
9138 }
9139#if 0
9140 /* keys are converted to strings using UTF-8 and
9141 then looked up since Python uses strings to hold
9142 variables names etc. in its namespaces and we
9143 wouldn't want to break common idioms. */
9144 key = PyUnicode_EncodeUTF8(keystart,
9145 keylen,
9146 NULL);
9147#else
9148 key = PyUnicode_FromUnicode(keystart, keylen);
9149#endif
9150 if (key == NULL)
9151 goto onError;
9152 if (args_owned) {
9153 Py_DECREF(args);
9154 args_owned = 0;
9155 }
9156 args = PyObject_GetItem(dict, key);
9157 Py_DECREF(key);
9158 if (args == NULL) {
9159 goto onError;
9160 }
9161 args_owned = 1;
9162 arglen = -1;
9163 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 while (--fmtcnt >= 0) {
9166 switch (c = *fmt++) {
9167 case '-': flags |= F_LJUST; continue;
9168 case '+': flags |= F_SIGN; continue;
9169 case ' ': flags |= F_BLANK; continue;
9170 case '#': flags |= F_ALT; continue;
9171 case '0': flags |= F_ZERO; continue;
9172 }
9173 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 if (c == '*') {
9176 v = getnextarg(args, arglen, &argidx);
9177 if (v == NULL)
9178 goto onError;
9179 if (!PyLong_Check(v)) {
9180 PyErr_SetString(PyExc_TypeError,
9181 "* wants int");
9182 goto onError;
9183 }
9184 width = PyLong_AsLong(v);
9185 if (width == -1 && PyErr_Occurred())
9186 goto onError;
9187 if (width < 0) {
9188 flags |= F_LJUST;
9189 width = -width;
9190 }
9191 if (--fmtcnt >= 0)
9192 c = *fmt++;
9193 }
9194 else if (c >= '0' && c <= '9') {
9195 width = c - '0';
9196 while (--fmtcnt >= 0) {
9197 c = *fmt++;
9198 if (c < '0' || c > '9')
9199 break;
9200 if ((width*10) / 10 != width) {
9201 PyErr_SetString(PyExc_ValueError,
9202 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 }
9205 width = width*10 + (c - '0');
9206 }
9207 }
9208 if (c == '.') {
9209 prec = 0;
9210 if (--fmtcnt >= 0)
9211 c = *fmt++;
9212 if (c == '*') {
9213 v = getnextarg(args, arglen, &argidx);
9214 if (v == NULL)
9215 goto onError;
9216 if (!PyLong_Check(v)) {
9217 PyErr_SetString(PyExc_TypeError,
9218 "* wants int");
9219 goto onError;
9220 }
9221 prec = PyLong_AsLong(v);
9222 if (prec == -1 && PyErr_Occurred())
9223 goto onError;
9224 if (prec < 0)
9225 prec = 0;
9226 if (--fmtcnt >= 0)
9227 c = *fmt++;
9228 }
9229 else if (c >= '0' && c <= '9') {
9230 prec = c - '0';
9231 while (--fmtcnt >= 0) {
9232 c = Py_CHARMASK(*fmt++);
9233 if (c < '0' || c > '9')
9234 break;
9235 if ((prec*10) / 10 != prec) {
9236 PyErr_SetString(PyExc_ValueError,
9237 "prec too big");
9238 goto onError;
9239 }
9240 prec = prec*10 + (c - '0');
9241 }
9242 }
9243 } /* prec */
9244 if (fmtcnt >= 0) {
9245 if (c == 'h' || c == 'l' || c == 'L') {
9246 if (--fmtcnt >= 0)
9247 c = *fmt++;
9248 }
9249 }
9250 if (fmtcnt < 0) {
9251 PyErr_SetString(PyExc_ValueError,
9252 "incomplete format");
9253 goto onError;
9254 }
9255 if (c != '%') {
9256 v = getnextarg(args, arglen, &argidx);
9257 if (v == NULL)
9258 goto onError;
9259 }
9260 sign = 0;
9261 fill = ' ';
9262 switch (c) {
9263
9264 case '%':
9265 pbuf = formatbuf;
9266 /* presume that buffer length is at least 1 */
9267 pbuf[0] = '%';
9268 len = 1;
9269 break;
9270
9271 case 's':
9272 case 'r':
9273 case 'a':
9274 if (PyUnicode_Check(v) && c == 's') {
9275 temp = v;
9276 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009277 }
9278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009279 if (c == 's')
9280 temp = PyObject_Str(v);
9281 else if (c == 'r')
9282 temp = PyObject_Repr(v);
9283 else
9284 temp = PyObject_ASCII(v);
9285 if (temp == NULL)
9286 goto onError;
9287 if (PyUnicode_Check(temp))
9288 /* nothing to do */;
9289 else {
9290 Py_DECREF(temp);
9291 PyErr_SetString(PyExc_TypeError,
9292 "%s argument has non-string str()");
9293 goto onError;
9294 }
9295 }
9296 pbuf = PyUnicode_AS_UNICODE(temp);
9297 len = PyUnicode_GET_SIZE(temp);
9298 if (prec >= 0 && len > prec)
9299 len = prec;
9300 break;
9301
9302 case 'i':
9303 case 'd':
9304 case 'u':
9305 case 'o':
9306 case 'x':
9307 case 'X':
9308 if (c == 'i')
9309 c = 'd';
9310 isnumok = 0;
9311 if (PyNumber_Check(v)) {
9312 PyObject *iobj=NULL;
9313
9314 if (PyLong_Check(v)) {
9315 iobj = v;
9316 Py_INCREF(iobj);
9317 }
9318 else {
9319 iobj = PyNumber_Long(v);
9320 }
9321 if (iobj!=NULL) {
9322 if (PyLong_Check(iobj)) {
9323 isnumok = 1;
9324 temp = formatlong(iobj, flags, prec, c);
9325 Py_DECREF(iobj);
9326 if (!temp)
9327 goto onError;
9328 pbuf = PyUnicode_AS_UNICODE(temp);
9329 len = PyUnicode_GET_SIZE(temp);
9330 sign = 1;
9331 }
9332 else {
9333 Py_DECREF(iobj);
9334 }
9335 }
9336 }
9337 if (!isnumok) {
9338 PyErr_Format(PyExc_TypeError,
9339 "%%%c format: a number is required, "
9340 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9341 goto onError;
9342 }
9343 if (flags & F_ZERO)
9344 fill = '0';
9345 break;
9346
9347 case 'e':
9348 case 'E':
9349 case 'f':
9350 case 'F':
9351 case 'g':
9352 case 'G':
9353 if (c == 'F')
9354 c = 'f';
9355 pbuf = formatbuf;
9356 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9357 flags, prec, c, v);
9358 if (len < 0)
9359 goto onError;
9360 sign = 1;
9361 if (flags & F_ZERO)
9362 fill = '0';
9363 break;
9364
9365 case 'c':
9366 pbuf = formatbuf;
9367 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9368 if (len < 0)
9369 goto onError;
9370 break;
9371
9372 default:
9373 PyErr_Format(PyExc_ValueError,
9374 "unsupported format character '%c' (0x%x) "
9375 "at index %zd",
9376 (31<=c && c<=126) ? (char)c : '?',
9377 (int)c,
9378 (Py_ssize_t)(fmt - 1 -
9379 PyUnicode_AS_UNICODE(uformat)));
9380 goto onError;
9381 }
9382 if (sign) {
9383 if (*pbuf == '-' || *pbuf == '+') {
9384 sign = *pbuf++;
9385 len--;
9386 }
9387 else if (flags & F_SIGN)
9388 sign = '+';
9389 else if (flags & F_BLANK)
9390 sign = ' ';
9391 else
9392 sign = 0;
9393 }
9394 if (width < len)
9395 width = len;
9396 if (rescnt - (sign != 0) < width) {
9397 reslen -= rescnt;
9398 rescnt = width + fmtcnt + 100;
9399 reslen += rescnt;
9400 if (reslen < 0) {
9401 Py_XDECREF(temp);
9402 PyErr_NoMemory();
9403 goto onError;
9404 }
9405 if (_PyUnicode_Resize(&result, reslen) < 0) {
9406 Py_XDECREF(temp);
9407 goto onError;
9408 }
9409 res = PyUnicode_AS_UNICODE(result)
9410 + reslen - rescnt;
9411 }
9412 if (sign) {
9413 if (fill != ' ')
9414 *res++ = sign;
9415 rescnt--;
9416 if (width > len)
9417 width--;
9418 }
9419 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9420 assert(pbuf[0] == '0');
9421 assert(pbuf[1] == c);
9422 if (fill != ' ') {
9423 *res++ = *pbuf++;
9424 *res++ = *pbuf++;
9425 }
9426 rescnt -= 2;
9427 width -= 2;
9428 if (width < 0)
9429 width = 0;
9430 len -= 2;
9431 }
9432 if (width > len && !(flags & F_LJUST)) {
9433 do {
9434 --rescnt;
9435 *res++ = fill;
9436 } while (--width > len);
9437 }
9438 if (fill == ' ') {
9439 if (sign)
9440 *res++ = sign;
9441 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9442 assert(pbuf[0] == '0');
9443 assert(pbuf[1] == c);
9444 *res++ = *pbuf++;
9445 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009446 }
9447 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 Py_UNICODE_COPY(res, pbuf, len);
9449 res += len;
9450 rescnt -= len;
9451 while (--width >= len) {
9452 --rescnt;
9453 *res++ = ' ';
9454 }
9455 if (dict && (argidx < arglen) && c != '%') {
9456 PyErr_SetString(PyExc_TypeError,
9457 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009458 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 goto onError;
9460 }
9461 Py_XDECREF(temp);
9462 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 } /* until end */
9464 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 PyErr_SetString(PyExc_TypeError,
9466 "not all arguments converted during string formatting");
9467 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 }
9469
Thomas Woutersa96affe2006-03-12 00:29:36 +00009470 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 }
9475 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 return (PyObject *)result;
9477
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 Py_XDECREF(result);
9480 Py_DECREF(uformat);
9481 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 }
9484 return NULL;
9485}
9486
Jeremy Hylton938ace62002-07-17 16:30:39 +00009487static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009488unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9489
Tim Peters6d6c1a32001-08-02 04:15:00 +00009490static PyObject *
9491unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9492{
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009494 static char *kwlist[] = {"object", "encoding", "errors", 0};
9495 char *encoding = NULL;
9496 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009497
Benjamin Peterson14339b62009-01-31 16:36:08 +00009498 if (type != &PyUnicode_Type)
9499 return unicode_subtype_new(type, args, kwds);
9500 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 return NULL;
9503 if (x == NULL)
9504 return (PyObject *)_PyUnicode_New(0);
9505 if (encoding == NULL && errors == NULL)
9506 return PyObject_Str(x);
9507 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009509}
9510
Guido van Rossume023fe02001-08-30 03:12:59 +00009511static PyObject *
9512unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9513{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009514 PyUnicodeObject *tmp, *pnew;
9515 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009516
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9518 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9519 if (tmp == NULL)
9520 return NULL;
9521 assert(PyUnicode_Check(tmp));
9522 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9523 if (pnew == NULL) {
9524 Py_DECREF(tmp);
9525 return NULL;
9526 }
9527 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9528 if (pnew->str == NULL) {
9529 _Py_ForgetReference((PyObject *)pnew);
9530 PyObject_Del(pnew);
9531 Py_DECREF(tmp);
9532 return PyErr_NoMemory();
9533 }
9534 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9535 pnew->length = n;
9536 pnew->hash = tmp->hash;
9537 Py_DECREF(tmp);
9538 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009539}
9540
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009541PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009543\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009544Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009545encoding defaults to the current default string encoding.\n\
9546errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009547
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009548static PyObject *unicode_iter(PyObject *seq);
9549
/* Type object of the built-in "str" type.  The initializer is positional:
   every entry is labelled with the PyTypeObject slot it fills. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009551 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009552 "str", /* tp_name */
9553 sizeof(PyUnicodeObject), /* tp_basicsize */
9554 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009556 (destructor)unicode_dealloc, /* tp_dealloc */
9557 0, /* tp_print */
9558 0, /* tp_getattr */
9559 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009560 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009561 unicode_repr, /* tp_repr */
9562 &unicode_as_number, /* tp_as_number */
9563 &unicode_as_sequence, /* tp_as_sequence */
9564 &unicode_as_mapping, /* tp_as_mapping */
9565 (hashfunc) unicode_hash, /* tp_hash */
9566 0, /* tp_call */
9567 (reprfunc) unicode_str, /* tp_str */
9568 PyObject_GenericGetAttr, /* tp_getattro */
9569 0, /* tp_setattro */
9570 0, /* tp_as_buffer */
9571 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009573 unicode_doc, /* tp_doc */
9574 0, /* tp_traverse */
9575 0, /* tp_clear */
9576 PyUnicode_RichCompare, /* tp_richcompare */
9577 0, /* tp_weaklistoffset */
9578 unicode_iter, /* tp_iter */
9579 0, /* tp_iternext */
9580 unicode_methods, /* tp_methods */
9581 0, /* tp_members */
9582 0, /* tp_getset */
9583 &PyBaseObject_Type, /* tp_base */
9584 0, /* tp_dict */
9585 0, /* tp_descr_get */
9586 0, /* tp_descr_set */
9587 0, /* tp_dictoffset */
9588 0, /* tp_init */
9589 0, /* tp_alloc */
9590 unicode_new, /* tp_new */
9591 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592};
9593
9594/* Initialize the Unicode implementation */
9595
Thomas Wouters78890102000-07-22 19:25:51 +00009596void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009598 int i;
9599
Thomas Wouters477c8d52006-05-27 19:21:47 +00009600 /* XXX - move this array to unicodectype.c ? */
9601 Py_UNICODE linebreak[] = {
9602 0x000A, /* LINE FEED */
9603 0x000D, /* CARRIAGE RETURN */
9604 0x001C, /* FILE SEPARATOR */
9605 0x001D, /* GROUP SEPARATOR */
9606 0x001E, /* RECORD SEPARATOR */
9607 0x0085, /* NEXT LINE */
9608 0x2028, /* LINE SEPARATOR */
9609 0x2029, /* PARAGRAPH SEPARATOR */
9610 };
9611
Fred Drakee4315f52000-05-09 19:53:39 +00009612 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009613 free_list = NULL;
9614 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009616 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009617 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009618
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009619 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009621 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009623
9624 /* initialize the linebreak bloom filter */
9625 bloom_linebreak = make_bloom_mask(
9626 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9627 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009628
9629 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630}
9631
9632/* Finalize the Unicode implementation */
9633
Christian Heimesa156e092008-02-16 07:38:31 +00009634int
9635PyUnicode_ClearFreeList(void)
9636{
9637 int freelist_size = numfree;
9638 PyUnicodeObject *u;
9639
9640 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 PyUnicodeObject *v = u;
9642 u = *(PyUnicodeObject **)u;
9643 if (v->str)
9644 PyObject_DEL(v->str);
9645 Py_XDECREF(v->defenc);
9646 PyObject_Del(v);
9647 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009648 }
9649 free_list = NULL;
9650 assert(numfree == 0);
9651 return freelist_size;
9652}
9653
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654void
Thomas Wouters78890102000-07-22 19:25:51 +00009655_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009657 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009659 Py_XDECREF(unicode_empty);
9660 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009661
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009662 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009663 if (unicode_latin1[i]) {
9664 Py_DECREF(unicode_latin1[i]);
9665 unicode_latin1[i] = NULL;
9666 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009667 }
Christian Heimesa156e092008-02-16 07:38:31 +00009668 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009670
Walter Dörwald16807132007-05-25 13:52:07 +00009671void
9672PyUnicode_InternInPlace(PyObject **p)
9673{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9675 PyObject *t;
9676 if (s == NULL || !PyUnicode_Check(s))
9677 Py_FatalError(
9678 "PyUnicode_InternInPlace: unicode strings only please!");
9679 /* If it's a subclass, we don't really know what putting
9680 it in the interned dict might do. */
9681 if (!PyUnicode_CheckExact(s))
9682 return;
9683 if (PyUnicode_CHECK_INTERNED(s))
9684 return;
9685 if (interned == NULL) {
9686 interned = PyDict_New();
9687 if (interned == NULL) {
9688 PyErr_Clear(); /* Don't leave an exception */
9689 return;
9690 }
9691 }
9692 /* It might be that the GetItem call fails even
9693 though the key is present in the dictionary,
9694 namely when this happens during a stack overflow. */
9695 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009697 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009698
Benjamin Peterson29060642009-01-31 22:14:21 +00009699 if (t) {
9700 Py_INCREF(t);
9701 Py_DECREF(*p);
9702 *p = t;
9703 return;
9704 }
Walter Dörwald16807132007-05-25 13:52:07 +00009705
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 PyThreadState_GET()->recursion_critical = 1;
9707 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9708 PyErr_Clear();
9709 PyThreadState_GET()->recursion_critical = 0;
9710 return;
9711 }
9712 PyThreadState_GET()->recursion_critical = 0;
9713 /* The two references in interned are not counted by refcnt.
9714 The deallocator will take care of this */
9715 Py_REFCNT(s) -= 2;
9716 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009717}
9718
9719void
9720PyUnicode_InternImmortal(PyObject **p)
9721{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009722 PyUnicode_InternInPlace(p);
9723 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9724 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9725 Py_INCREF(*p);
9726 }
Walter Dörwald16807132007-05-25 13:52:07 +00009727}
9728
9729PyObject *
9730PyUnicode_InternFromString(const char *cp)
9731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732 PyObject *s = PyUnicode_FromString(cp);
9733 if (s == NULL)
9734 return NULL;
9735 PyUnicode_InternInPlace(&s);
9736 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009737}
9738
9739void _Py_ReleaseInternedUnicodeStrings(void)
9740{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 PyObject *keys;
9742 PyUnicodeObject *s;
9743 Py_ssize_t i, n;
9744 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009745
Benjamin Peterson14339b62009-01-31 16:36:08 +00009746 if (interned == NULL || !PyDict_Check(interned))
9747 return;
9748 keys = PyDict_Keys(interned);
9749 if (keys == NULL || !PyList_Check(keys)) {
9750 PyErr_Clear();
9751 return;
9752 }
Walter Dörwald16807132007-05-25 13:52:07 +00009753
Benjamin Peterson14339b62009-01-31 16:36:08 +00009754 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9755 detector, interned unicode strings are not forcibly deallocated;
9756 rather, we give them their stolen references back, and then clear
9757 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009758
Benjamin Peterson14339b62009-01-31 16:36:08 +00009759 n = PyList_GET_SIZE(keys);
9760 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009762 for (i = 0; i < n; i++) {
9763 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9764 switch (s->state) {
9765 case SSTATE_NOT_INTERNED:
9766 /* XXX Shouldn't happen */
9767 break;
9768 case SSTATE_INTERNED_IMMORTAL:
9769 Py_REFCNT(s) += 1;
9770 immortal_size += s->length;
9771 break;
9772 case SSTATE_INTERNED_MORTAL:
9773 Py_REFCNT(s) += 2;
9774 mortal_size += s->length;
9775 break;
9776 default:
9777 Py_FatalError("Inconsistent interned string state.");
9778 }
9779 s->state = SSTATE_NOT_INTERNED;
9780 }
9781 fprintf(stderr, "total size of all interned strings: "
9782 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9783 "mortal/immortal\n", mortal_size, immortal_size);
9784 Py_DECREF(keys);
9785 PyDict_Clear(interned);
9786 Py_DECREF(interned);
9787 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009788}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009789
9790
9791/********************* Unicode Iterator **************************/
9792
/* State of a str iterator (objects of this layout are created by
   unicode_iter()). */
9793typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009794 PyObject_HEAD
9795 Py_ssize_t it_index; /* Index of the next character to return */
9796 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009797} unicodeiterobject;
9798
9799static void
9800unicodeiter_dealloc(unicodeiterobject *it)
9801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009802 _PyObject_GC_UNTRACK(it);
9803 Py_XDECREF(it->it_seq);
9804 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009805}
9806
9807static int
9808unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9809{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009810 Py_VISIT(it->it_seq);
9811 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009812}
9813
9814static PyObject *
9815unicodeiter_next(unicodeiterobject *it)
9816{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009817 PyUnicodeObject *seq;
9818 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009819
Benjamin Peterson14339b62009-01-31 16:36:08 +00009820 assert(it != NULL);
9821 seq = it->it_seq;
9822 if (seq == NULL)
9823 return NULL;
9824 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009825
Benjamin Peterson14339b62009-01-31 16:36:08 +00009826 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9827 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 if (item != NULL)
9830 ++it->it_index;
9831 return item;
9832 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009833
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 Py_DECREF(seq);
9835 it->it_seq = NULL;
9836 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009837}
9838
9839static PyObject *
9840unicodeiter_len(unicodeiterobject *it)
9841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 Py_ssize_t len = 0;
9843 if (it->it_seq)
9844 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9845 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009846}
9847
/* Docstring attached to the "__length_hint__" entry of unicodeiter_methods. */
9848PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9849
/* Method table of the str iterator type (installed as tp_methods of
   PyUnicodeIter_Type).  Its only entry exposes unicodeiter_len() as
   __length_hint__, taking no arguments. */
9850static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009854};
9855
/* Type object of the "str_iterator" type.  The initializer is positional:
   every entry is labelled with the PyTypeObject slot it fills.  The type
   takes part in garbage collection (Py_TPFLAGS_HAVE_GC together with a
   tp_traverse function). */
9856PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9858 "str_iterator", /* tp_name */
9859 sizeof(unicodeiterobject), /* tp_basicsize */
9860 0, /* tp_itemsize */
9861 /* methods */
9862 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9863 0, /* tp_print */
9864 0, /* tp_getattr */
9865 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009866 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867 0, /* tp_repr */
9868 0, /* tp_as_number */
9869 0, /* tp_as_sequence */
9870 0, /* tp_as_mapping */
9871 0, /* tp_hash */
9872 0, /* tp_call */
9873 0, /* tp_str */
9874 PyObject_GenericGetAttr, /* tp_getattro */
9875 0, /* tp_setattro */
9876 0, /* tp_as_buffer */
9877 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9878 0, /* tp_doc */
9879 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9880 0, /* tp_clear */
9881 0, /* tp_richcompare */
9882 0, /* tp_weaklistoffset */
9883 PyObject_SelfIter, /* tp_iter */
9884 (iternextfunc)unicodeiter_next, /* tp_iternext */
9885 unicodeiter_methods, /* tp_methods */
9886 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009887};
9888
9889static PyObject *
9890unicode_iter(PyObject *seq)
9891{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009892 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009893
Benjamin Peterson14339b62009-01-31 16:36:08 +00009894 if (!PyUnicode_Check(seq)) {
9895 PyErr_BadInternalCall();
9896 return NULL;
9897 }
9898 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9899 if (it == NULL)
9900 return NULL;
9901 it->it_index = 0;
9902 Py_INCREF(seq);
9903 it->it_seq = (PyUnicodeObject *)seq;
9904 _PyObject_GC_TRACK(it);
9905 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009906}
9907
Martin v. Löwis5b222132007-06-10 09:51:05 +00009908size_t
9909Py_UNICODE_strlen(const Py_UNICODE *u)
9910{
9911 int res = 0;
9912 while(*u++)
9913 res++;
9914 return res;
9915}
9916
9917Py_UNICODE*
9918Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9919{
9920 Py_UNICODE *u = s1;
9921 while ((*u++ = *s2++));
9922 return s1;
9923}
9924
9925Py_UNICODE*
9926Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9927{
9928 Py_UNICODE *u = s1;
9929 while ((*u++ = *s2++))
9930 if (n-- == 0)
9931 break;
9932 return s1;
9933}
9934
9935int
9936Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9937{
9938 while (*s1 && *s2 && *s1 == *s2)
9939 s1++, s2++;
9940 if (*s1 && *s2)
9941 return (*s1 < *s2) ? -1 : +1;
9942 if (*s1)
9943 return 1;
9944 if (*s2)
9945 return -1;
9946 return 0;
9947}
9948
9949Py_UNICODE*
9950Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9951{
9952 const Py_UNICODE *p;
9953 for (p = s; *p; p++)
9954 if (*p == c)
9955 return (Py_UNICODE*)p;
9956 return NULL;
9957}
9958
9959
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009960#ifdef __cplusplus
9961}
9962#endif
9963
9964
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009965/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 Local variables:
9967 c-basic-offset: 4
9968 indent-tabs-mode: nil
9969 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009970*/