blob: aeaa53b2d8c0cac1a353696a5212459a79f5e59b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for Unicode objects parked on the free list.

   Objects on the free list whose length is below this limit keep
   their character buffer allocated, which saves a malloc() round
   trip when small Unicode objects are recycled.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage lying around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point (0x00 - 0x7F), 1 for whitespace and 0 otherwise.
   Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same idea for linebreaks: one entry per ASCII code point
   (0x00 - 0x7F), 1 for a linebreak character and 0 otherwise.
   Each row below covers 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000188 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than a plain int 1: shifting an
   int 1 left by 31 overflows a 32-bit int, which is undefined
   behaviour.  Both arguments are parenthesized so that arbitrary
   expressions (e.g. a | b) can be passed as the mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Linebreak test: a table lookup for ch < 128; otherwise the bloom mask
   acts as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Test whether chr is a member of set: the bloom mask is checked first
   as a cheap filter, and unicode_member() is only consulted when the
   mask matches.  The whole expansion is parenthesized so the macro can
   be negated or combined with other operators safely.  Note that chr
   is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000379 case SSTATE_NOT_INTERNED:
380 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
384 Py_REFCNT(unicode) = 3;
385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
387 "deletion of interned string failed");
388 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000389
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_INTERNED_IMMORTAL:
391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 default:
394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401 PyObject_DEL(unicode->str);
402 unicode->str = NULL;
403 unicode->length = 0;
404 }
405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 PyObject_DEL(unicode->str);
416 Py_XDECREF(unicode->defenc);
417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyErr_BadInternalCall();
429 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
447 Py_DECREF(*unicode);
448 *unicode = w;
449 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
Benjamin Peterson29060642009-01-31 22:14:21 +0000471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000475 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
483 if (!unicode)
484 return NULL;
485 unicode->str[0] = *u;
486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
Benjamin Peterson14339b62009-01-31 16:36:08 +0000508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000510 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000511 return NULL;
512 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000525
526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
529 unicode = unicode_latin1[Py_CHARMASK(*u)];
530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
534 unicode->str[0] = Py_CHARMASK(*u);
535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566{
567 PyUnicodeObject *unicode;
568
569 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000570 if (size == 0)
571 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000572 PyErr_BadInternalCall();
573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574 }
575
Martin v. Löwis790465f2008-04-05 20:41:37 +0000576 if (size == -1) {
577 size = wcslen(w);
578 }
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 unicode = _PyUnicode_New(size);
581 if (!unicode)
582 return NULL;
583
584 /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000587#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000589 register Py_UNICODE *u;
590 register Py_ssize_t i;
591 u = PyUnicode_AS_UNICODE(unicode);
592 for (i = size; i > 0; i--)
593 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 }
595#endif
596
597 return (PyObject *)unicode;
598}
599
Walter Dörwald346737f2007-05-31 10:44:43 +0000600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000603 *fmt++ = '%';
604 if (width) {
605 if (zeropad)
606 *fmt++ = '0';
607 fmt += sprintf(fmt, "%d", width);
608 }
609 if (precision)
610 fmt += sprintf(fmt, ".%d", precision);
611 if (longflag)
612 *fmt++ = 'l';
613 else if (size_tflag) {
614 char *f = PY_FORMAT_SIZE_T;
615 while (*f)
616 *fmt++ = *f++;
617 }
618 *fmt++ = c;
619 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000620}
621
/* Append the NUL-terminated string `string` to the output, advancing
   the output cursor.  Expects two variables in the enclosing scope:
   `copy` (the read cursor) and `s` (the output cursor).  The
   terminating NUL is not copied. */
#define appendstring(string)                                    \
    {                                                           \
        for (copy = string; *copy; copy++)                      \
            *s++ = *copy;                                       \
    }
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000627 va_list count;
628 Py_ssize_t callcount = 0;
629 PyObject **callresults = NULL;
630 PyObject **callresult = NULL;
631 Py_ssize_t n = 0;
632 int width = 0;
633 int precision = 0;
634 int zeropad;
635 const char* f;
636 Py_UNICODE *s;
637 PyObject *string;
638 /* used by sprintf */
639 char buffer[21];
640 /* use abuffer instead of buffer, if we need more space
641 * (which can happen if there's a format specifier with width). */
642 char *abuffer = NULL;
643 char *realbuffer;
644 Py_ssize_t abuffersize = 0;
645 char fmt[60]; /* should be enough for %0width.precisionld */
646 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000647
648#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000649 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000650#else
651#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000652 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000653#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000654 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655#endif
656#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000657 /* step 1: count the number of %S/%R/%A format specifications
658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659 * these objects once during step 3 and put the result in
Benjamin Peterson29060642009-01-31 22:14:21 +0000660 an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000661 for (f = format; *f; f++) {
662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
663 ++callcount;
664 }
665 /* step 2: allocate memory for the results of
666 * PyObject_Str()/PyObject_Repr() calls */
667 if (callcount) {
668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
669 if (!callresults) {
670 PyErr_NoMemory();
671 return NULL;
672 }
673 callresult = callresults;
674 }
675 /* step 3: figure out how large a buffer we need */
676 for (f = format; *f; f++) {
677 if (*f == '%') {
678 const char* p = f;
679 width = 0;
680 while (ISDIGIT((unsigned)*f))
681 width = (width*10) + *f++ - '0';
682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
683 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000684
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686 * they don't affect the amount of space we reserve.
687 */
688 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000689 (f[1] == 'd' || f[1] == 'u'))
690 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 switch (*f) {
693 case 'c':
694 (void)va_arg(count, int);
695 /* fall through... */
696 case '%':
697 n++;
698 break;
699 case 'd': case 'u': case 'i': case 'x':
700 (void) va_arg(count, int);
701 /* 20 bytes is enough to hold a 64-bit
702 integer. Decimal takes the most space.
703 This isn't enough for octal.
704 If a width is specified we need more
705 (which we allocate later). */
706 if (width < 20)
707 width = 20;
708 n += width;
709 if (abuffersize < width)
710 abuffersize = width;
711 break;
712 case 's':
713 {
714 /* UTF-8 */
715 unsigned char*s;
716 s = va_arg(count, unsigned char*);
717 while (*s) {
718 if (*s < 128) {
719 n++; s++;
720 } else if (*s < 0xc0) {
721 /* invalid UTF-8 */
722 n++; s++;
723 } else if (*s < 0xc0) {
724 n++;
725 s++; if(!*s)break;
726 s++;
727 } else if (*s < 0xe0) {
728 n++;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000733#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 n++;
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000736 n+=2;
Benjamin Peterson29060642009-01-31 22:14:21 +0000737#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 s++; if(!*s)break;
739 s++; if(!*s)break;
740 s++; if(!*s)break;
741 s++;
742 }
743 }
744 break;
745 }
746 case 'U':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 assert(obj && PyUnicode_Check(obj));
750 n += PyUnicode_GET_SIZE(obj);
751 break;
752 }
753 case 'V':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 const char *str = va_arg(count, const char *);
757 assert(obj || str);
758 assert(!obj || PyUnicode_Check(obj));
759 if (obj)
760 n += PyUnicode_GET_SIZE(obj);
761 else
762 n += strlen(str);
763 break;
764 }
765 case 'S':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 PyObject *str;
769 assert(obj);
770 str = PyObject_Str(obj);
771 if (!str)
772 goto fail;
773 n += PyUnicode_GET_SIZE(str);
774 /* Remember the str and switch to the next slot */
775 *callresult++ = str;
776 break;
777 }
778 case 'R':
779 {
780 PyObject *obj = va_arg(count, PyObject *);
781 PyObject *repr;
782 assert(obj);
783 repr = PyObject_Repr(obj);
784 if (!repr)
785 goto fail;
786 n += PyUnicode_GET_SIZE(repr);
787 /* Remember the repr and switch to the next slot */
788 *callresult++ = repr;
789 break;
790 }
791 case 'A':
792 {
793 PyObject *obj = va_arg(count, PyObject *);
794 PyObject *ascii;
795 assert(obj);
796 ascii = PyObject_ASCII(obj);
797 if (!ascii)
798 goto fail;
799 n += PyUnicode_GET_SIZE(ascii);
800 /* Remember the repr and switch to the next slot */
801 *callresult++ = ascii;
802 break;
803 }
804 case 'p':
805 (void) va_arg(count, int);
806 /* maximum 64-bit pointer representation:
807 * 0xffffffffffffffff
808 * so 19 characters is enough.
809 * XXX I count 18 -- what's the extra for?
810 */
811 n += 19;
812 break;
813 default:
814 /* if we stumble upon an unknown
815 formatting code, copy the rest of
816 the format string to the output
817 string. (we cannot just skip the
818 code, since there's no way to know
819 what's in the argument list) */
820 n += strlen(p);
821 goto expand;
822 }
823 } else
824 n++;
825 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000827 if (abuffersize > 20) {
828 abuffer = PyObject_Malloc(abuffersize);
829 if (!abuffer) {
830 PyErr_NoMemory();
831 goto fail;
832 }
833 realbuffer = abuffer;
834 }
835 else
836 realbuffer = buffer;
837 /* step 4: fill the buffer */
838 /* Since we've analyzed how much space we need for the worst case,
839 we don't have to resize the string.
840 There can be no errors beyond this point. */
841 string = PyUnicode_FromUnicode(NULL, n);
842 if (!string)
843 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844
Benjamin Peterson14339b62009-01-31 16:36:08 +0000845 s = PyUnicode_AS_UNICODE(string);
846 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847
Benjamin Peterson14339b62009-01-31 16:36:08 +0000848 for (f = format; *f; f++) {
849 if (*f == '%') {
850 const char* p = f++;
851 int longflag = 0;
852 int size_tflag = 0;
853 zeropad = (*f == '0');
854 /* parse the width.precision part */
855 width = 0;
856 while (ISDIGIT((unsigned)*f))
857 width = (width*10) + *f++ - '0';
858 precision = 0;
859 if (*f == '.') {
860 f++;
861 while (ISDIGIT((unsigned)*f))
862 precision = (precision*10) + *f++ - '0';
863 }
864 /* handle the long flag, but only for %ld and %lu.
865 others can be added when necessary. */
866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867 longflag = 1;
868 ++f;
869 }
870 /* handle the size_t flag. */
871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872 size_tflag = 1;
873 ++f;
874 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 switch (*f) {
877 case 'c':
878 *s++ = va_arg(vargs, int);
879 break;
880 case 'd':
881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
882 if (longflag)
883 sprintf(realbuffer, fmt, va_arg(vargs, long));
884 else if (size_tflag)
885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
886 else
887 sprintf(realbuffer, fmt, va_arg(vargs, int));
888 appendstring(realbuffer);
889 break;
890 case 'u':
891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
892 if (longflag)
893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
894 else if (size_tflag)
895 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
896 else
897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898 appendstring(realbuffer);
899 break;
900 case 'i':
901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
904 break;
905 case 'x':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 's':
911 {
912 /* Parameter must be UTF-8 encoded.
913 In case of encoding errors, use
914 the replacement character. */
915 PyObject *u;
916 p = va_arg(vargs, char*);
917 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 "replace");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000919 if (!u)
920 goto fail;
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Peterson29060642009-01-31 22:14:21 +0000922 PyUnicode_GET_SIZE(u));
Benjamin Peterson14339b62009-01-31 16:36:08 +0000923 s += PyUnicode_GET_SIZE(u);
924 Py_DECREF(u);
925 break;
926 }
927 case 'U':
928 {
929 PyObject *obj = va_arg(vargs, PyObject *);
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 break;
934 }
935 case 'V':
936 {
937 PyObject *obj = va_arg(vargs, PyObject *);
938 const char *str = va_arg(vargs, const char *);
939 if (obj) {
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 } else {
944 appendstring(str);
945 }
946 break;
947 }
948 case 'S':
949 case 'R':
950 {
951 Py_UNICODE *ucopy;
952 Py_ssize_t usize;
953 Py_ssize_t upos;
954 /* unused, since we already have the result */
955 (void) va_arg(vargs, PyObject *);
956 ucopy = PyUnicode_AS_UNICODE(*callresult);
957 usize = PyUnicode_GET_SIZE(*callresult);
958 for (upos = 0; upos<usize;)
959 *s++ = ucopy[upos++];
960 /* We're done with the unicode()/repr() => forget it */
961 Py_DECREF(*callresult);
962 /* switch to next unicode()/repr() result */
963 ++callresult;
964 break;
965 }
966 case 'p':
967 sprintf(buffer, "%p", va_arg(vargs, void*));
968 /* %p is ill-defined: ensure leading 0x. */
969 if (buffer[1] == 'X')
970 buffer[1] = 'x';
971 else if (buffer[1] != 'x') {
972 memmove(buffer+2, buffer, strlen(buffer)+1);
973 buffer[0] = '0';
974 buffer[1] = 'x';
975 }
976 appendstring(buffer);
977 break;
978 case '%':
979 *s++ = '%';
980 break;
981 default:
982 appendstring(p);
983 goto end;
984 }
985 } else
986 *s++ = *f;
987 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000988
Benjamin Peterson29060642009-01-31 22:14:21 +0000989 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 if (callresults)
991 PyObject_Free(callresults);
992 if (abuffer)
993 PyObject_Free(abuffer);
994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
995 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +0000996 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 if (callresults) {
998 PyObject **callresult2 = callresults;
999 while (callresult2 < callresult) {
1000 Py_DECREF(*callresult2);
1001 ++callresult2;
1002 }
1003 PyObject_Free(callresults);
1004 }
1005 if (abuffer)
1006 PyObject_Free(abuffer);
1007 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 PyObject* ret;
1016 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
1018#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001019 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001022#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001023 ret = PyUnicode_FromFormatV(format, vargs);
1024 va_end(vargs);
1025 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001026}
1027
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001029 wchar_t *w,
1030 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031{
1032 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001033 PyErr_BadInternalCall();
1034 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001036
1037 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001039 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001040
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041#ifdef HAVE_USABLE_WCHAR_T
1042 memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001045 register Py_UNICODE *u;
1046 register Py_ssize_t i;
1047 u = PyUnicode_AS_UNICODE(unicode);
1048 for (i = size; i > 0; i--)
1049 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 }
1051#endif
1052
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001053 if (size > PyUnicode_GET_SIZE(unicode))
1054 return PyUnicode_GET_SIZE(unicode);
1055 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001056 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057}
1058
1059#endif
1060
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001066 PyErr_SetString(PyExc_ValueError,
1067 "chr() arg not in range(0x110000)");
1068 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001069 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001070
1071#ifndef Py_UNICODE_WIDE
1072 if (ordinal > 0xffff) {
1073 ordinal -= 0x10000;
1074 s[0] = 0xD800 | (ordinal >> 10);
1075 s[1] = 0xDC00 | (ordinal & 0x3FF);
1076 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#endif
1079
Hye-Shik Chang40574832004-04-06 07:24:51 +00001080 s[0] = (Py_UNICODE)ordinal;
1081 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001082}
1083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001089 Py_INCREF(obj);
1090 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001091 }
1092 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001093 /* For a Unicode subtype that's not a Unicode object,
1094 return a true Unicode object with the same data. */
1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001097 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001098 PyErr_Format(PyExc_TypeError,
1099 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001100 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001101 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001105 const char *encoding,
1106 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001113 PyErr_BadInternalCall();
1114 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001117 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 PyErr_SetString(PyExc_TypeError,
1119 "decoding str is not supported");
1120 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122
1123 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001124 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001125 s = PyBytes_AS_STRING(obj);
1126 len = PyBytes_GET_SIZE(obj);
1127 }
1128 else if (PyByteArray_Check(obj)) {
1129 s = PyByteArray_AS_STRING(obj);
1130 len = PyByteArray_GET_SIZE(obj);
1131 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001132 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001133 /* Overwrite the error message with something more useful in
1134 case of a TypeError. */
1135 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001137 "coercing to str: need string or buffer, "
1138 "%.80s found",
1139 Py_TYPE(obj)->tp_name);
1140 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001141 }
Tim Petersced69f82003-09-16 20:30:58 +00001142
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001143 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 Py_INCREF(unicode_empty);
1146 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 }
Tim Petersced69f82003-09-16 20:30:58 +00001148 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001150
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001151 return v;
1152
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 Py_ssize_t size,
1159 const char *encoding,
1160 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161{
1162 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001163 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 char lower[20]; /* Enough for any encoding name we recognize */
1165 char *l;
1166 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
1168 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 encoding = PyUnicode_GetDefaultEncoding();
1170
1171 /* Convert encoding to lower case and replace '_' with '-' in order to
1172 catch e.g. UTF_8 */
1173 e = encoding;
1174 l = lower;
1175 while (*e && l < &lower[(sizeof lower) - 2]) {
1176 if (ISUPPER(*e)) {
1177 *l++ = TOLOWER(*e++);
1178 }
1179 else if (*e == '_') {
1180 *l++ = '-';
1181 e++;
1182 }
1183 else {
1184 *l++ = *e++;
1185 }
1186 }
1187 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001188
1189 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if ((strcmp(lower, "latin-1") == 0) ||
1193 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001196 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001201 else if (strcmp(lower, "utf-16") == 0)
1202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203 else if (strcmp(lower, "utf-32") == 0)
1204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (buffer == NULL)
1212 goto onError;
1213 unicode = PyCodec_Decode(buffer, encoding, errors);
1214 if (unicode == NULL)
1215 goto onError;
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_DECREF(unicode);
1221 goto onError;
1222 }
1223 Py_DECREF(buffer);
1224 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001225
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 Py_XDECREF(buffer);
1228 return NULL;
1229}
1230
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v;
1236
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241
1242 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001243 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001244
1245 /* Decode via the codec registry */
1246 v = PyCodec_Decode(unicode, encoding, errors);
1247 if (v == NULL)
1248 goto onError;
1249 return v;
1250
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001252 return NULL;
1253}
1254
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256 const char *encoding,
1257 const char *errors)
1258{
1259 PyObject *v;
1260
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 goto onError;
1264 }
1265
1266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001268
1269 /* Decode via the codec registry */
1270 v = PyCodec_Decode(unicode, encoding, errors);
1271 if (v == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(v)) {
1274 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001275 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001276 Py_TYPE(v)->tp_name);
1277 Py_DECREF(v);
1278 goto onError;
1279 }
1280 return v;
1281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001283 return NULL;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001287 Py_ssize_t size,
1288 const char *encoding,
1289 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290{
1291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 unicode = PyUnicode_FromUnicode(s, size);
1294 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297 Py_DECREF(unicode);
1298 return v;
1299}
1300
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302 const char *encoding,
1303 const char *errors)
1304{
1305 PyObject *v;
1306
1307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadArgument();
1309 goto onError;
1310 }
1311
1312 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001313 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 return v;
1320
Benjamin Peterson29060642009-01-31 22:14:21 +00001321 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001322 return NULL;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326 const char *encoding,
1327 const char *errors)
1328{
1329 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (!PyUnicode_Check(unicode)) {
1332 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 }
Fred Drakee4315f52000-05-09 19:53:39 +00001335
Tim Petersced69f82003-09-16 20:30:58 +00001336 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001337 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001338
1339 /* Shortcuts for common default encodings */
1340 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001341 if (strcmp(encoding, "utf-8") == 0)
1342 return PyUnicode_AsUTF8String(unicode);
1343 else if (strcmp(encoding, "latin-1") == 0)
1344 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001346 else if (strcmp(encoding, "mbcs") == 0)
1347 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001348#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001349 else if (strcmp(encoding, "ascii") == 0)
1350 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001351 /* During bootstrap, we may need to find the encodings
1352 package, to load the file system encoding, and require the
1353 file system encoding in order to load the encodings
1354 package.
1355
1356 Break out of this dependency by assuming that the path to
1357 the encodings module is ASCII-only. XXX could try wcstombs
1358 instead, if the file system encoding is the locale's
1359 encoding. */
1360 else if (Py_FileSystemDefaultEncoding &&
1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001363 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
1366 /* Encode via the codec registry */
1367 v = PyCodec_Encode(unicode, encoding, errors);
1368 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001369 return NULL;
1370
1371 /* The normal path */
1372 if (PyBytes_Check(v))
1373 return v;
1374
1375 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376 if (PyByteArray_Check(v)) {
1377 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001378 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001379 PyOS_snprintf(msg, sizeof(msg),
1380 "encoder %s returned buffer instead of bytes",
1381 encoding);
1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001383 Py_DECREF(v);
1384 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001385 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388 Py_DECREF(v);
1389 return b;
1390 }
1391
1392 PyErr_Format(PyExc_TypeError,
1393 "encoder did not return a bytes object (type=%.400s)",
1394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001411 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001412
1413 /* Encode via the codec registry */
1414 v = PyCodec_Encode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427 return NULL;
1428}
1429
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001432{
1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001434 if (v)
1435 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001436 if (errors != NULL)
1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001439 PyUnicode_GET_SIZE(unicode),
1440 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001441 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001442 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001443 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001444 return v;
1445}
1446
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001448PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001450 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001452
Christian Heimes5894ba72007-11-04 11:43:14 +00001453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457 can be undefined. If it is case, decode using UTF-8. The following assumes
1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459 bootstrapping process where the codecs aren't ready yet.
1460 */
1461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001464 return PyUnicode_DecodeMBCS(s, size, "replace");
1465 }
1466#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001468 return PyUnicode_DecodeUTF8(s, size, "replace");
1469 }
1470#endif
1471 return PyUnicode_Decode(s, size,
1472 Py_FileSystemDefaultEncoding,
1473 "replace");
1474 }
1475 else {
1476 return PyUnicode_DecodeUTF8(s, size, "replace");
1477 }
1478}
1479
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001482{
Christian Heimesf3863112007-11-22 07:46:41 +00001483 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
1486 return NULL;
1487 }
Christian Heimesf3863112007-11-22 07:46:41 +00001488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001490 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001491 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001492 *psize = PyBytes_GET_SIZE(bytes);
1493 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001494}
1495
1496char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001498{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001499 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001500}
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 goto onError;
1507 }
1508 return PyUnicode_AS_UNICODE(unicode);
1509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 return NULL;
1512}
1513
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515{
1516 if (!PyUnicode_Check(unicode)) {
1517 PyErr_BadArgument();
1518 goto onError;
1519 }
1520 return PyUnicode_GET_SIZE(unicode);
1521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 return -1;
1524}
1525
Thomas Wouters78890102000-07-22 19:25:51 +00001526const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001527{
1528 return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001533 if (strcmp(encoding, unicode_default_encoding) != 0) {
1534 PyErr_Format(PyExc_ValueError,
1535 "Can only set default encoding to %s",
1536 unicode_default_encoding);
1537 return -1;
1538 }
Fred Drakee4315f52000-05-09 19:53:39 +00001539 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001540}
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542/* error handling callback helper:
1543 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001544 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 and adjust various state variables.
1546 return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001551 const char *encoding, const char *reason,
1552 const char **input, const char **inend, Py_ssize_t *startinpos,
1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557
1558 PyObject *restuple = NULL;
1559 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t requiredsize;
1563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001565 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 int res = -1;
1568
1569 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 *errorHandler = PyCodec_LookupError(errors);
1571 if (*errorHandler == NULL)
1572 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001573 }
1574
1575 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001576 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1578 if (*exceptionObject == NULL)
1579 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001580 }
1581 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585 goto onError;
1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001588 }
1589
1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001592 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001594 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 }
1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
1600 /* Copy back the bytes variables, which might have been modified by the
1601 callback */
1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603 if (!inputobj)
1604 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001605 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001608 *input = PyBytes_AS_STRING(inputobj);
1609 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001610 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001611 /* we can DECREF safely, as the exception has another reference,
1612 so the object won't go away. */
1613 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001616 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001617 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1619 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 /* need more space? (at least enough for what we
1623 have+the replacement+the rest of the string (starting
1624 at the new input position), so we won't have to check space
1625 when there are no errors in the rest of the string) */
1626 repptr = PyUnicode_AS_UNICODE(repunicode);
1627 repsize = PyUnicode_GET_SIZE(repunicode);
1628 requiredsize = *outpos + repsize + insize-newpos;
1629 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001630 if (requiredsize<2*outsize)
1631 requiredsize = 2*outsize;
1632 if (_PyUnicode_Resize(output, requiredsize) < 0)
1633 goto onError;
1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 }
1636 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001637 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_UNICODE_COPY(*outptr, repptr, repsize);
1639 *outptr += repsize;
1640 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 /* we made it! */
1643 res = 0;
1644
Benjamin Peterson29060642009-01-31 22:14:21 +00001645 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 Py_XDECREF(restuple);
1647 return res;
1648}
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

       The table is indexed by character code (see the SPECIAL macro);
       each row below covers 16 consecutive codes. */
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1672
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when character c must not be written
   directly.  Anything outside 1..127 is special; inside that range the
   utf7_special table decides, with whitespace (class 2) and Set O (class 3)
   counted as special only when the corresponding flag is non-zero.
   Note that c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per ISALNUM), '+' or '/'. */
#define B64CHAR(c)                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64 for a character that satisfies B64CHAR:
   '+' -> 62, '/' -> 63, 'a'.. -> c - 71, 'A'.. -> c - 65, otherwise
   (digits) -> c + 4.  The result is meaningless for other characters. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write the
   top pending 6 bits as one base64 character through out and decrease bits
   by 6.  Modifies out and bits; expands to a bare while statement. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, take the top pending 16 bits as one code unit.  A unit in
   0xDC00..0xDFFF sets surrogate and jumps to the error path; the unit that
   follows such an error is dropped.  Otherwise the unit is written through
   out.  The macro relies on a variable `errmsg` and a label `utf7Error`
   being defined in the function that uses it. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001717 Py_ssize_t size,
1718 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
/* Decode `size` bytes of UTF-7 starting at `s` into a new unicode object.

   errors:   error handler name, passed to unicode_decode_call_errorhandler()
             whenever malformed input is found.
   consumed: if NULL, input that ends inside a shift sequence is reported as
             an "unterminated shift sequence" error.  If non-NULL, no such
             error is raised; *consumed receives the number of bytes decoded,
             which is the offset of the opening '+' when the input ends inside
             a shift sequence.

   Returns the new object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    /* startinpos is assigned only when a '+' or a directly-encoded special
       character is seen; errors inside a shift sequence reuse the offset of
       the '+' that opened it. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a '+...' sequence */
    unsigned int bitsleft = 0;  /* number of pending bits in charsleft */
    unsigned long charsleft = 0; /* bit accumulator for base64 data */
    int surrogate = 0;          /* state flag used by the DECODE macro */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* One output unit per input byte is allocated up front; the error
       handler helper grows the object itself if it needs more. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below leaves input to process. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence: explicit '-' or any
                   non-base64 character. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* NOTE(review): when the terminating '-' is followed by
                       another '-', a '-' is emitted and inShift is set again
                       without consuming that byte -- looks surprising;
                       confirm this is intended. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The terminating character is itself ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 character: shift its 6 bits into the accumulator
                   and emit any complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" is the escape for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
      utf7Error:
        /* Hand the error to the handler; it may replace the input buffer,
           move s, and resize/move the output (starts, e, s, unicode and p
           are all passed by address). */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence and the caller is not decoding
       incrementally: report it. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* The handler may have positioned s before the end of the input. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the object to the number of units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 Py_ssize_t size,
1872 int encodeSetO,
1873 int encodeWhiteSpace,
1874 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001875{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001876 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 unsigned int bitsleft = 0;
1882 unsigned long charsleft = 0;
1883 char * out;
1884 char * start;
1885
1886 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001887 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001889 if (cbAllocated / 5 != size)
1890 return PyErr_NoMemory();
1891
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001892 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (v == NULL)
1894 return NULL;
1895
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001896 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 for (;i < size; ++i) {
1898 Py_UNICODE ch = s[i];
1899
1900 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001901 if (ch == '+') {
1902 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *out++ = '-';
1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905 charsleft = ch;
1906 bitsleft = 16;
1907 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 } else {
1911 *out++ = (char) ch;
1912 }
1913 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915 *out++ = B64(charsleft << (6-bitsleft));
1916 charsleft = 0;
1917 bitsleft = 0;
1918 /* Characters not in the BASE64 set implicitly unshift the sequence
1919 so no '-' is required, except if the character is itself a '-' */
1920 if (B64CHAR(ch) || ch == '-') {
1921 *out++ = '-';
1922 }
1923 inShift = 0;
1924 *out++ = (char) ch;
1925 } else {
1926 bitsleft += 16;
1927 charsleft = (charsleft << 16) | ch;
1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001931 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 or '-' then the shift sequence will be terminated implicitly and we
1933 don't have to insert a '-'. */
1934
1935 if (bitsleft == 0) {
1936 if (i + 1 < size) {
1937 Py_UNICODE ch2 = s[i+1];
1938
1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 } else if (B64CHAR(ch2) || ch2 == '-') {
1942 *out++ = '-';
1943 inShift = 0;
1944 } else {
1945 inShift = 0;
1946 }
1947
1948 }
1949 else {
1950 *out++ = '-';
1951 inShift = 0;
1952 }
1953 }
Tim Petersced69f82003-09-16 20:30:58 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (bitsleft) {
1958 *out++= B64(charsleft << (6-bitsleft) );
1959 *out++ = '-';
1960 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001961 if (_PyBytes_Resize(&v, out - start) < 0)
1962 return NULL;
1963 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001964}
1965
1966#undef SPECIAL
1967#undef B64
1968#undef B64CHAR
1969#undef UB64
1970#undef ENCODE
1971#undef DECODE
1972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973/* --- UTF-8 Codec -------------------------------------------------------- */
1974
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       The table is indexed by the first byte of a sequence; each row below
       covers 16 consecutive byte values. */
    /* 0x00 - 0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: 4, 0xF8 - 0xFB: 5, 0xFC - 0xFD: 6, 0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1996
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001998 Py_ssize_t size,
1999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000{
Walter Dörwald69652032004-09-07 20:24:22 +00002001 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2002}
2003
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   ANDing an address with it gives the offset within a long-sized unit;
   ANDing with its complement rounds the address down to such a boundary.
   Note the expansion is a cast expression without outer parentheses. */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: it has the top bit (0x80) of every byte
   set, so a non-zero AND means at least one byte is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2016
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   errors:   error handler name, passed to unicode_decode_call_errorhandler()
             whenever malformed input is found.
   consumed: if NULL, a sequence truncated by the end of the input is
             reported as "unexpected end of data".  If non-NULL, decoding
             simply stops in front of the truncated sequence and *consumed
             receives the number of bytes that were decoded.

   Returns the new object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;                      /* length of the current sequence */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    /* aligned_end is e rounded down to a 'long' boundary; the fast path
       below must not read a whole long beyond it. */
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count (the error handler helper resizes the object itself
       when a replacement needs more room) */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* Reload: the fast path may have advanced s. */
                ch = (unsigned char)*s;
            }
        }

        /* Single ASCII byte (also handles the unaligned head and tail that
           the fast path skips). */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the lead byte runs past the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that the table marks as an illegal prefix. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Length 1 means ch < 0x80, which was handled above. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The trailing byte must have the form 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that would have fit in one byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences;

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
                */
                /* NOTE(review): the note above sits inside the branch that
                   rejects values below 0x0800 (values that would have fit
                   in a shorter sequence); no surrogate check is performed
                   anywhere in this case. */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

      utf8Error:
        /* Hand the error to the handler; it may replace the input buffer,
           move s, and resize/move the output (starts, e, s, unicode and p
           are all passed by address). */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* e may have changed, so the aligned end must be recomputed. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2245
Antoine Pitrouab868312009-01-10 15:40:25 +00002246#undef ASCII_CHAR_MASK
2247
2248
Tim Peters602f7402002-04-27 18:03:26 +00002249/* Allocation strategy: if the string is short, convert into a stack buffer
2250 and allocate exactly as much space needed at the end. Else allocate the
2251 maximum possible needed (4 result bytes per Unicode character), and return
2252 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002253*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002254PyObject *
2255PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002256 Py_ssize_t size,
2257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258{
Tim Peters602f7402002-04-27 18:03:26 +00002259#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002260
Guido van Rossum98297ee2007-11-06 21:34:58 +00002261 Py_ssize_t i; /* index into s of next input byte */
2262 PyObject *result; /* result string object */
2263 char *p; /* next free byte in output buffer */
2264 Py_ssize_t nallocated; /* number of result bytes allocated */
2265 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002266 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002267
Tim Peters602f7402002-04-27 18:03:26 +00002268 assert(s != NULL);
2269 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270
Tim Peters602f7402002-04-27 18:03:26 +00002271 if (size <= MAX_SHORT_UNICHARS) {
2272 /* Write into the stack buffer; nallocated can't overflow.
2273 * At the end, we'll allocate exactly as much heap space as it
2274 * turns out we need.
2275 */
2276 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002277 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002278 p = stackbuf;
2279 }
2280 else {
2281 /* Overallocate on the heap, and give the excess back at the end. */
2282 nallocated = size * 4;
2283 if (nallocated / 4 != size) /* overflow! */
2284 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002285 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002286 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002287 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002288 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002289 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002290
Tim Peters602f7402002-04-27 18:03:26 +00002291 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002292 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002293
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002294 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002295 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002297
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002299 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002300 *p++ = (char)(0xc0 | (ch >> 6));
2301 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002302 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002303 else {
Tim Peters602f7402002-04-27 18:03:26 +00002304 /* Encode UCS2 Unicode ordinals */
2305 if (ch < 0x10000) {
2306 /* Special case: check for high surrogate */
2307 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2308 Py_UCS4 ch2 = s[i];
2309 /* Check for low surrogate and combine the two to
2310 form a UCS4 value */
2311 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002312 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002313 i++;
2314 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002315 }
Tim Peters602f7402002-04-27 18:03:26 +00002316 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002317 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002318 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002319 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2320 *p++ = (char)(0x80 | (ch & 0x3f));
2321 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002323 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002324 /* Encode UCS4 Unicode ordinals */
2325 *p++ = (char)(0xf0 | (ch >> 18));
2326 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2327 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2328 *p++ = (char)(0x80 | (ch & 0x3f));
2329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002331
Guido van Rossum98297ee2007-11-06 21:34:58 +00002332 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002333 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002334 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002335 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002336 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002337 }
2338 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002339 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002340 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002341 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002342 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002343 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002344 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002345
Tim Peters602f7402002-04-27 18:03:26 +00002346#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347}
2348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 if (!PyUnicode_Check(unicode)) {
2352 PyErr_BadArgument();
2353 return NULL;
2354 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002355 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 PyUnicode_GET_SIZE(unicode),
2357 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358}
2359
Walter Dörwald41980ca2007-08-16 21:55:45 +00002360/* --- UTF-32 Codec ------------------------------------------------------- */
2361
2362PyObject *
2363PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 Py_ssize_t size,
2365 const char *errors,
2366 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002367{
2368 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2369}
2370
2371PyObject *
2372PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002373 Py_ssize_t size,
2374 const char *errors,
2375 int *byteorder,
2376 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002377{
2378 const char *starts = s;
2379 Py_ssize_t startinpos;
2380 Py_ssize_t endinpos;
2381 Py_ssize_t outpos;
2382 PyUnicodeObject *unicode;
2383 Py_UNICODE *p;
2384#ifndef Py_UNICODE_WIDE
2385 int i, pairs;
2386#else
2387 const int pairs = 0;
2388#endif
2389 const unsigned char *q, *e;
2390 int bo = 0; /* assume native ordering by default */
2391 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002392 /* Offsets from q for retrieving bytes in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394 int iorder[] = {0, 1, 2, 3};
2395#else
2396 int iorder[] = {3, 2, 1, 0};
2397#endif
2398 PyObject *errorHandler = NULL;
2399 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002400 /* On narrow builds we split characters outside the BMP into two
2401 codepoints => count how much extra space we need. */
2402#ifndef Py_UNICODE_WIDE
2403 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002404 if (((Py_UCS4 *)s)[i] >= 0x10000)
2405 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002406#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002407
2408 /* This might be one to much, because of a BOM */
2409 unicode = _PyUnicode_New((size+3)/4+pairs);
2410 if (!unicode)
2411 return NULL;
2412 if (size == 0)
2413 return (PyObject *)unicode;
2414
2415 /* Unpack UTF-32 encoded data */
2416 p = unicode->str;
2417 q = (unsigned char *)s;
2418 e = q + size;
2419
2420 if (byteorder)
2421 bo = *byteorder;
2422
2423 /* Check for BOM marks (U+FEFF) in the input and adjust current
2424 byte order setting accordingly. In native mode, the leading BOM
2425 mark is skipped, in all other modes, it is copied to the output
2426 stream as-is (giving a ZWNBSP character). */
2427 if (bo == 0) {
2428 if (size >= 4) {
2429 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 if (bom == 0x0000FEFF) {
2433 q += 4;
2434 bo = -1;
2435 }
2436 else if (bom == 0xFFFE0000) {
2437 q += 4;
2438 bo = 1;
2439 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002440#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002441 if (bom == 0x0000FEFF) {
2442 q += 4;
2443 bo = 1;
2444 }
2445 else if (bom == 0xFFFE0000) {
2446 q += 4;
2447 bo = -1;
2448 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002449#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002451 }
2452
2453 if (bo == -1) {
2454 /* force LE */
2455 iorder[0] = 0;
2456 iorder[1] = 1;
2457 iorder[2] = 2;
2458 iorder[3] = 3;
2459 }
2460 else if (bo == 1) {
2461 /* force BE */
2462 iorder[0] = 3;
2463 iorder[1] = 2;
2464 iorder[2] = 1;
2465 iorder[3] = 0;
2466 }
2467
2468 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002469 Py_UCS4 ch;
2470 /* remaining bytes at the end? (size should be divisible by 4) */
2471 if (e-q<4) {
2472 if (consumed)
2473 break;
2474 errmsg = "truncated data";
2475 startinpos = ((const char *)q)-starts;
2476 endinpos = ((const char *)e)-starts;
2477 goto utf32Error;
2478 /* The remaining input chars are ignored if the callback
2479 chooses to skip the input */
2480 }
2481 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2482 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483
Benjamin Peterson29060642009-01-31 22:14:21 +00002484 if (ch >= 0x110000)
2485 {
2486 errmsg = "codepoint not in range(0x110000)";
2487 startinpos = ((const char *)q)-starts;
2488 endinpos = startinpos+4;
2489 goto utf32Error;
2490 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002491#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002492 if (ch >= 0x10000)
2493 {
2494 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2495 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2496 }
2497 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002498#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 *p++ = ch;
2500 q += 4;
2501 continue;
2502 utf32Error:
2503 outpos = p-PyUnicode_AS_UNICODE(unicode);
2504 if (unicode_decode_call_errorhandler(
2505 errors, &errorHandler,
2506 "utf32", errmsg,
2507 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2508 &unicode, &outpos, &p))
2509 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002510 }
2511
2512 if (byteorder)
2513 *byteorder = bo;
2514
2515 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002516 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002517
2518 /* Adjust length */
2519 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2520 goto onError;
2521
2522 Py_XDECREF(errorHandler);
2523 Py_XDECREF(exc);
2524 return (PyObject *)unicode;
2525
Benjamin Peterson29060642009-01-31 22:14:21 +00002526 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002527 Py_DECREF(unicode);
2528 Py_XDECREF(errorHandler);
2529 Py_XDECREF(exc);
2530 return NULL;
2531}
2532
2533PyObject *
2534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 Py_ssize_t size,
2536 const char *errors,
2537 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002538{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002539 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002540 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002541 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002542#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002543 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002544#else
2545 const int pairs = 0;
2546#endif
2547 /* Offsets from p for storing byte pairs in the right order. */
2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2549 int iorder[] = {0, 1, 2, 3};
2550#else
2551 int iorder[] = {3, 2, 1, 0};
2552#endif
2553
Benjamin Peterson29060642009-01-31 22:14:21 +00002554#define STORECHAR(CH) \
2555 do { \
2556 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2557 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2558 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2559 p[iorder[0]] = (CH) & 0xff; \
2560 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002561 } while(0)
2562
2563 /* In narrow builds we can output surrogate pairs as one codepoint,
2564 so we need less space. */
2565#ifndef Py_UNICODE_WIDE
2566 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002567 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2568 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2569 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002570#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002571 nsize = (size - pairs + (byteorder == 0));
2572 bytesize = nsize * 4;
2573 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002574 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002575 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002576 if (v == NULL)
2577 return NULL;
2578
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002579 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002580 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002582 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002583 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002584
2585 if (byteorder == -1) {
2586 /* force LE */
2587 iorder[0] = 0;
2588 iorder[1] = 1;
2589 iorder[2] = 2;
2590 iorder[3] = 3;
2591 }
2592 else if (byteorder == 1) {
2593 /* force BE */
2594 iorder[0] = 3;
2595 iorder[1] = 2;
2596 iorder[2] = 1;
2597 iorder[3] = 0;
2598 }
2599
2600 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002601 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002602#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2604 Py_UCS4 ch2 = *s;
2605 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2606 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2607 s++;
2608 size--;
2609 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002611#endif
2612 STORECHAR(ch);
2613 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002614
2615 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002616 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002617#undef STORECHAR
2618}
2619
2620PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2621{
2622 if (!PyUnicode_Check(unicode)) {
2623 PyErr_BadArgument();
2624 return NULL;
2625 }
2626 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002627 PyUnicode_GET_SIZE(unicode),
2628 NULL,
2629 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002630}
2631
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632/* --- UTF-16 Codec ------------------------------------------------------- */
2633
Tim Peters772747b2001-08-09 22:21:55 +00002634PyObject *
2635PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002636 Py_ssize_t size,
2637 const char *errors,
2638 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639{
Walter Dörwald69652032004-09-07 20:24:22 +00002640 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2641}
2642
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects the top bit of the most significant byte of every
   16-bit unit packed in the long.  If (word & mask) == 0, every unit is
   below 0x8000 and therefore cannot be a surrogate (0xD800..0xDFFF).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2659
Walter Dörwald69652032004-09-07 20:24:22 +00002660PyObject *
2661PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 Py_ssize_t size,
2663 const char *errors,
2664 int *byteorder,
2665 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002668 Py_ssize_t startinpos;
2669 Py_ssize_t endinpos;
2670 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *unicode;
2672 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002673 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002674 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002675 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002676 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002677 /* Offsets from q for retrieving byte pairs in the right order. */
2678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2679 int ihi = 1, ilo = 0;
2680#else
2681 int ihi = 0, ilo = 1;
2682#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 PyObject *errorHandler = NULL;
2684 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685
2686 /* Note: size will always be longer than the resulting Unicode
2687 character count */
2688 unicode = _PyUnicode_New(size);
2689 if (!unicode)
2690 return NULL;
2691 if (size == 0)
2692 return (PyObject *)unicode;
2693
2694 /* Unpack UTF-16 encoded data */
2695 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002696 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002697 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002700 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002702 /* Check for BOM marks (U+FEFF) in the input and adjust current
2703 byte order setting accordingly. In native mode, the leading BOM
2704 mark is skipped, in all other modes, it is copied to the output
2705 stream as-is (giving a ZWNBSP character). */
2706 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002707 if (size >= 2) {
2708 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002709#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 if (bom == 0xFEFF) {
2711 q += 2;
2712 bo = -1;
2713 }
2714 else if (bom == 0xFFFE) {
2715 q += 2;
2716 bo = 1;
2717 }
Tim Petersced69f82003-09-16 20:30:58 +00002718#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 if (bom == 0xFEFF) {
2720 q += 2;
2721 bo = 1;
2722 }
2723 else if (bom == 0xFFFE) {
2724 q += 2;
2725 bo = -1;
2726 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002727#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730
Tim Peters772747b2001-08-09 22:21:55 +00002731 if (bo == -1) {
2732 /* force LE */
2733 ihi = 1;
2734 ilo = 0;
2735 }
2736 else if (bo == 1) {
2737 /* force BE */
2738 ihi = 0;
2739 ilo = 1;
2740 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742 native_ordering = ilo < ihi;
2743#else
2744 native_ordering = ilo > ihi;
2745#endif
Tim Peters772747b2001-08-09 22:21:55 +00002746
Antoine Pitrouab868312009-01-10 15:40:25 +00002747 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002748 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002750 /* First check for possible aligned read of a C 'long'. Unaligned
2751 reads are more expensive, better to defer to another iteration. */
2752 if (!((size_t) q & LONG_PTR_MASK)) {
2753 /* Fast path for runs of non-surrogate chars. */
2754 register const unsigned char *_q = q;
2755 Py_UNICODE *_p = p;
2756 if (native_ordering) {
2757 /* Native ordering is simple: as long as the input cannot
2758 possibly contain a surrogate char, do an unrolled copy
2759 of several 16-bit code points to the target object.
2760 The non-surrogate check is done on several input bytes
2761 at a time (as many as a C 'long' can contain). */
2762 while (_q < aligned_end) {
2763 unsigned long data = * (unsigned long *) _q;
2764 if (data & FAST_CHAR_MASK)
2765 break;
2766 _p[0] = ((unsigned short *) _q)[0];
2767 _p[1] = ((unsigned short *) _q)[1];
2768#if (SIZEOF_LONG == 8)
2769 _p[2] = ((unsigned short *) _q)[2];
2770 _p[3] = ((unsigned short *) _q)[3];
2771#endif
2772 _q += SIZEOF_LONG;
2773 _p += SIZEOF_LONG / 2;
2774 }
2775 }
2776 else {
2777 /* Byteswapped ordering is similar, but we must decompose
2778 the copy bytewise, and take care of zero'ing out the
2779 upper bytes if the target object is in 32-bit units
2780 (that is, in UCS-4 builds). */
2781 while (_q < aligned_end) {
2782 unsigned long data = * (unsigned long *) _q;
2783 if (data & SWAPPED_FAST_CHAR_MASK)
2784 break;
2785 /* Zero upper bytes in UCS-4 builds */
2786#if (Py_UNICODE_SIZE > 2)
2787 _p[0] = 0;
2788 _p[1] = 0;
2789#if (SIZEOF_LONG == 8)
2790 _p[2] = 0;
2791 _p[3] = 0;
2792#endif
2793#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002794 /* Issue #4916; UCS-4 builds on big endian machines must
2795 fill the two last bytes of each 4-byte unit. */
2796#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2797# define OFF 2
2798#else
2799# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002800#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002801 ((unsigned char *) _p)[OFF + 1] = _q[0];
2802 ((unsigned char *) _p)[OFF + 0] = _q[1];
2803 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2804 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2805#if (SIZEOF_LONG == 8)
2806 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2807 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2808 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2809 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2810#endif
2811#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002812 _q += SIZEOF_LONG;
2813 _p += SIZEOF_LONG / 2;
2814 }
2815 }
2816 p = _p;
2817 q = _q;
2818 if (q >= e)
2819 break;
2820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002821 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822
Benjamin Peterson14339b62009-01-31 16:36:08 +00002823 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002824
2825 if (ch < 0xD800 || ch > 0xDFFF) {
2826 *p++ = ch;
2827 continue;
2828 }
2829
2830 /* UTF-16 code pair: */
2831 if (q > e) {
2832 errmsg = "unexpected end of data";
2833 startinpos = (((const char *)q) - 2) - starts;
2834 endinpos = ((const char *)e) + 1 - starts;
2835 goto utf16Error;
2836 }
2837 if (0xD800 <= ch && ch <= 0xDBFF) {
2838 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2839 q += 2;
2840 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002841#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 *p++ = ch;
2843 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002844#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002845 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002846#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 continue;
2848 }
2849 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002850 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 startinpos = (((const char *)q)-4)-starts;
2852 endinpos = startinpos+2;
2853 goto utf16Error;
2854 }
2855
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002857 errmsg = "illegal encoding";
2858 startinpos = (((const char *)q)-2)-starts;
2859 endinpos = startinpos+2;
2860 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002861
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 utf16Error:
2863 outpos = p - PyUnicode_AS_UNICODE(unicode);
2864 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002865 errors,
2866 &errorHandler,
2867 "utf16", errmsg,
2868 &starts,
2869 (const char **)&e,
2870 &startinpos,
2871 &endinpos,
2872 &exc,
2873 (const char **)&q,
2874 &unicode,
2875 &outpos,
2876 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002879 /* remaining byte at the end? (size should be even) */
2880 if (e == q) {
2881 if (!consumed) {
2882 errmsg = "truncated data";
2883 startinpos = ((const char *)q) - starts;
2884 endinpos = ((const char *)e) + 1 - starts;
2885 outpos = p - PyUnicode_AS_UNICODE(unicode);
2886 if (unicode_decode_call_errorhandler(
2887 errors,
2888 &errorHandler,
2889 "utf16", errmsg,
2890 &starts,
2891 (const char **)&e,
2892 &startinpos,
2893 &endinpos,
2894 &exc,
2895 (const char **)&q,
2896 &unicode,
2897 &outpos,
2898 &p))
2899 goto onError;
2900 /* The remaining input chars are ignored if the callback
2901 chooses to skip the input */
2902 }
2903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904
2905 if (byteorder)
2906 *byteorder = bo;
2907
Walter Dörwald69652032004-09-07 20:24:22 +00002908 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002910
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002912 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 goto onError;
2914
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915 Py_XDECREF(errorHandler);
2916 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 return (PyObject *)unicode;
2918
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 return NULL;
2924}
2925
Antoine Pitrouab868312009-01-10 15:40:25 +00002926#undef FAST_CHAR_MASK
2927#undef SWAPPED_FAST_CHAR_MASK
2928
Tim Peters772747b2001-08-09 22:21:55 +00002929PyObject *
2930PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 Py_ssize_t size,
2932 const char *errors,
2933 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002935 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002936 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002937 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002938#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002939 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002940#else
2941 const int pairs = 0;
2942#endif
Tim Peters772747b2001-08-09 22:21:55 +00002943 /* Offsets from p for storing byte pairs in the right order. */
2944#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2945 int ihi = 1, ilo = 0;
2946#else
2947 int ihi = 0, ilo = 1;
2948#endif
2949
Benjamin Peterson29060642009-01-31 22:14:21 +00002950#define STORECHAR(CH) \
2951 do { \
2952 p[ihi] = ((CH) >> 8) & 0xff; \
2953 p[ilo] = (CH) & 0xff; \
2954 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002955 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002957#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002958 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002959 if (s[i] >= 0x10000)
2960 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002961#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002962 /* 2 * (size + pairs + (byteorder == 0)) */
2963 if (size > PY_SSIZE_T_MAX ||
2964 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002966 nsize = size + pairs + (byteorder == 0);
2967 bytesize = nsize * 2;
2968 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002970 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 if (v == NULL)
2972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002974 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002976 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002977 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002978 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002979
2980 if (byteorder == -1) {
2981 /* force LE */
2982 ihi = 1;
2983 ilo = 0;
2984 }
2985 else if (byteorder == 1) {
2986 /* force BE */
2987 ihi = 0;
2988 ilo = 1;
2989 }
2990
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002991 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 Py_UNICODE ch = *s++;
2993 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002994#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 if (ch >= 0x10000) {
2996 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2997 ch = 0xD800 | ((ch-0x10000) >> 10);
2998 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002999#endif
Tim Peters772747b2001-08-09 22:21:55 +00003000 STORECHAR(ch);
3001 if (ch2)
3002 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003003 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003004
3005 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003006 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003007#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008}
3009
3010PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3011{
3012 if (!PyUnicode_Check(unicode)) {
3013 PyErr_BadArgument();
3014 return NULL;
3015 }
3016 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 PyUnicode_GET_SIZE(unicode),
3018 NULL,
3019 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020}
3021
3022/* --- Unicode Escape Codec ----------------------------------------------- */
3023
Fredrik Lundh06d12682001-01-24 07:59:11 +00003024static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 Py_ssize_t size,
3028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003031 Py_ssize_t startinpos;
3032 Py_ssize_t endinpos;
3033 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003038 char* message;
3039 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 PyObject *errorHandler = NULL;
3041 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 /* Escaped strings will always be longer than the resulting
3044 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 length after conversion to the true value.
3046 (but if the error callback returns a long replacement string
3047 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 v = _PyUnicode_New(size);
3049 if (v == NULL)
3050 goto onError;
3051 if (size == 0)
3052 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 while (s < end) {
3058 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003059 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061
3062 /* Non-escape characters are interpreted as Unicode ordinals */
3063 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003064 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 continue;
3066 }
3067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 /* \ - Escapes */
3070 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003071 c = *s++;
3072 if (s > end)
3073 c = '\0'; /* Invalid after \ */
3074 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 case '\n': break;
3078 case '\\': *p++ = '\\'; break;
3079 case '\'': *p++ = '\''; break;
3080 case '\"': *p++ = '\"'; break;
3081 case 'b': *p++ = '\b'; break;
3082 case 'f': *p++ = '\014'; break; /* FF */
3083 case 't': *p++ = '\t'; break;
3084 case 'n': *p++ = '\n'; break;
3085 case 'r': *p++ = '\r'; break;
3086 case 'v': *p++ = '\013'; break; /* VT */
3087 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3088
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 case '0': case '1': case '2': case '3':
3091 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003092 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003093 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003094 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003095 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003096 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003098 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 break;
3100
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 /* hex escapes */
3102 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003104 digits = 2;
3105 message = "truncated \\xXX escape";
3106 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003110 digits = 4;
3111 message = "truncated \\uXXXX escape";
3112 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003115 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003116 digits = 8;
3117 message = "truncated \\UXXXXXXXX escape";
3118 hexescape:
3119 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 outpos = p-PyUnicode_AS_UNICODE(v);
3121 if (s+digits>end) {
3122 endinpos = size;
3123 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003124 errors, &errorHandler,
3125 "unicodeescape", "end of string in escape sequence",
3126 &starts, &end, &startinpos, &endinpos, &exc, &s,
3127 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003128 goto onError;
3129 goto nextByte;
3130 }
3131 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003132 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003133 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 endinpos = (s+i+1)-starts;
3135 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003136 errors, &errorHandler,
3137 "unicodeescape", message,
3138 &starts, &end, &startinpos, &endinpos, &exc, &s,
3139 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003140 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003142 }
3143 chr = (chr<<4) & ~0xF;
3144 if (c >= '0' && c <= '9')
3145 chr += c - '0';
3146 else if (c >= 'a' && c <= 'f')
3147 chr += 10 + c - 'a';
3148 else
3149 chr += 10 + c - 'A';
3150 }
3151 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003152 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 /* _decoding_error will have already written into the
3154 target buffer. */
3155 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003156 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003157 /* when we get here, chr is a 32-bit unicode character */
3158 if (chr <= 0xffff)
3159 /* UCS-2 character */
3160 *p++ = (Py_UNICODE) chr;
3161 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003162 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003163 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003164#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003165 *p++ = chr;
3166#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003167 chr -= 0x10000L;
3168 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003169 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003170#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003171 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 endinpos = s-starts;
3173 outpos = p-PyUnicode_AS_UNICODE(v);
3174 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003175 errors, &errorHandler,
3176 "unicodeescape", "illegal Unicode character",
3177 &starts, &end, &startinpos, &endinpos, &exc, &s,
3178 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003179 goto onError;
3180 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003181 break;
3182
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003184 case 'N':
3185 message = "malformed \\N character escape";
3186 if (ucnhash_CAPI == NULL) {
3187 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003188 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003189 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003190 if (m == NULL)
3191 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003192 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003193 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003194 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003195 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003196 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003197 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003198 if (ucnhash_CAPI == NULL)
3199 goto ucnhashError;
3200 }
3201 if (*s == '{') {
3202 const char *start = s+1;
3203 /* look for the closing brace */
3204 while (*s != '}' && s < end)
3205 s++;
3206 if (s > start && s < end && *s == '}') {
3207 /* found a name. look it up in the unicode database */
3208 message = "unknown Unicode character name";
3209 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003210 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003211 goto store;
3212 }
3213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214 endinpos = s-starts;
3215 outpos = p-PyUnicode_AS_UNICODE(v);
3216 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 errors, &errorHandler,
3218 "unicodeescape", message,
3219 &starts, &end, &startinpos, &endinpos, &exc, &s,
3220 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003221 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003222 break;
3223
3224 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003225 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 message = "\\ at end of string";
3227 s--;
3228 endinpos = s-starts;
3229 outpos = p-PyUnicode_AS_UNICODE(v);
3230 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 errors, &errorHandler,
3232 "unicodeescape", message,
3233 &starts, &end, &startinpos, &endinpos, &exc, &s,
3234 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003235 goto onError;
3236 }
3237 else {
3238 *p++ = '\\';
3239 *p++ = (unsigned char)s[-1];
3240 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003241 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003244 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003246 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003248 Py_XDECREF(errorHandler);
3249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003251
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003253 PyErr_SetString(
3254 PyExc_UnicodeError,
3255 "\\N escapes not supported (can't load unicodedata module)"
3256 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003257 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003258 Py_XDECREF(errorHandler);
3259 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003260 return NULL;
3261
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 Py_XDECREF(errorHandler);
3265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 return NULL;
3267}
3268
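/* Return a Unicode-Escape string version of the Unicode object
   (see PyUnicode_EncodeUnicodeEscape() below; findchar() is only a
   small search helper).

   NOTE: this comment used to continue "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate."  The encoder below takes
   no `quotes' argument, so that sentence appears to be stale.

*/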
3275
Thomas Wouters477c8d52006-05-27 19:21:47 +00003276Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 Py_ssize_t size,
3278 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003279{
3280 /* like wcschr, but doesn't stop at NULL characters */
3281
3282 while (size-- > 0) {
3283 if (*s == ch)
3284 return s;
3285 s++;
3286 }
3287
3288 return NULL;
3289}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003290
Walter Dörwald79e913e2007-05-12 11:08:06 +00003291static const char *hexdigits = "0123456789abcdef";
3292
3293PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003296 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003299#ifdef Py_UNICODE_WIDE
3300 const Py_ssize_t expandsize = 10;
3301#else
3302 const Py_ssize_t expandsize = 6;
3303#endif
3304
Thomas Wouters89f507f2006-12-13 04:49:30 +00003305 /* XXX(nnorwitz): rather than over-allocating, it would be
3306 better to choose a different scheme. Perhaps scan the
3307 first N-chars of the string and allocate based on that size.
3308 */
3309 /* Initial allocation is based on the longest-possible unichr
3310 escape.
3311
3312 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3313 unichr, so in this case it's the longest unichr escape. In
3314 narrow (UTF-16) builds this is five chars per source unichr
3315 since there are two unichrs in the surrogate pair, so in narrow
3316 (UTF-16) builds it's not the longest unichr escape.
3317
3318 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3319 so in the narrow (UTF-16) build case it's the longest unichr
3320 escape.
3321 */
3322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 if (size == 0)
3324 return PyBytes_FromStringAndSize(NULL, 0);
3325
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003326 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003328
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003329 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 2
3331 + expandsize*size
3332 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (repr == NULL)
3334 return NULL;
3335
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003336 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 while (size-- > 0) {
3339 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003340
Walter Dörwald79e913e2007-05-12 11:08:06 +00003341 /* Escape backslashes */
3342 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 *p++ = '\\';
3344 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003345 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003346 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003347
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003349 /* Map 21-bit characters to '\U00xxxxxx' */
3350 else if (ch >= 0x10000) {
3351 *p++ = '\\';
3352 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003353 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3354 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3355 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3356 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3357 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3358 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3359 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3360 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003362 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003363#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3365 else if (ch >= 0xD800 && ch < 0xDC00) {
3366 Py_UNICODE ch2;
3367 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003368
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 ch2 = *s++;
3370 size--;
3371 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3372 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3373 *p++ = '\\';
3374 *p++ = 'U';
3375 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3376 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3377 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3378 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3379 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3380 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3381 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3382 *p++ = hexdigits[ucs & 0x0000000F];
3383 continue;
3384 }
3385 /* Fall through: isolated surrogates are copied as-is */
3386 s--;
3387 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003388 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003389#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003390
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003392 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 *p++ = '\\';
3394 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003395 *p++ = hexdigits[(ch >> 12) & 0x000F];
3396 *p++ = hexdigits[(ch >> 8) & 0x000F];
3397 *p++ = hexdigits[(ch >> 4) & 0x000F];
3398 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003400
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003401 /* Map special whitespace to '\t', \n', '\r' */
3402 else if (ch == '\t') {
3403 *p++ = '\\';
3404 *p++ = 't';
3405 }
3406 else if (ch == '\n') {
3407 *p++ = '\\';
3408 *p++ = 'n';
3409 }
3410 else if (ch == '\r') {
3411 *p++ = '\\';
3412 *p++ = 'r';
3413 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003414
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003415 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003416 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003418 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003419 *p++ = hexdigits[(ch >> 4) & 0x000F];
3420 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003421 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003422
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 /* Copy everything else as-is */
3424 else
3425 *p++ = (char) ch;
3426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003428 assert(p - PyBytes_AS_STRING(repr) > 0);
3429 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3430 return NULL;
3431 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432}
3433
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003434PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003436 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 if (!PyUnicode_Check(unicode)) {
3438 PyErr_BadArgument();
3439 return NULL;
3440 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003441 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3442 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003443 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444}
3445
3446/* --- Raw Unicode Escape Codec ------------------------------------------- */
3447
3448PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 Py_ssize_t size,
3450 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003453 Py_ssize_t startinpos;
3454 Py_ssize_t endinpos;
3455 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 const char *end;
3459 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 PyObject *errorHandler = NULL;
3461 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003462
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 /* Escaped strings will always be longer than the resulting
3464 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 length after conversion to the true value. (But decoding error
3466 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 v = _PyUnicode_New(size);
3468 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 end = s + size;
3474 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003475 unsigned char c;
3476 Py_UCS4 x;
3477 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003478 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 /* Non-escape characters are interpreted as Unicode ordinals */
3481 if (*s != '\\') {
3482 *p++ = (unsigned char)*s++;
3483 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 startinpos = s-starts;
3486
3487 /* \u-escapes are only interpreted iff the number of leading
3488 backslashes if odd */
3489 bs = s;
3490 for (;s < end;) {
3491 if (*s != '\\')
3492 break;
3493 *p++ = (unsigned char)*s++;
3494 }
3495 if (((s - bs) & 1) == 0 ||
3496 s >= end ||
3497 (*s != 'u' && *s != 'U')) {
3498 continue;
3499 }
3500 p--;
3501 count = *s=='u' ? 4 : 8;
3502 s++;
3503
3504 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3505 outpos = p-PyUnicode_AS_UNICODE(v);
3506 for (x = 0, i = 0; i < count; ++i, ++s) {
3507 c = (unsigned char)*s;
3508 if (!ISXDIGIT(c)) {
3509 endinpos = s-starts;
3510 if (unicode_decode_call_errorhandler(
3511 errors, &errorHandler,
3512 "rawunicodeescape", "truncated \\uXXXX",
3513 &starts, &end, &startinpos, &endinpos, &exc, &s,
3514 &v, &outpos, &p))
3515 goto onError;
3516 goto nextByte;
3517 }
3518 x = (x<<4) & ~0xF;
3519 if (c >= '0' && c <= '9')
3520 x += c - '0';
3521 else if (c >= 'a' && c <= 'f')
3522 x += 10 + c - 'a';
3523 else
3524 x += 10 + c - 'A';
3525 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003526 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 /* UCS-2 character */
3528 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003529 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003530 /* UCS-4 character. Either store directly, or as
3531 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003532#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003534#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 x -= 0x10000L;
3536 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3537 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003538#endif
3539 } else {
3540 endinpos = s-starts;
3541 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003542 if (unicode_decode_call_errorhandler(
3543 errors, &errorHandler,
3544 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 &starts, &end, &startinpos, &endinpos, &exc, &s,
3546 &v, &outpos, &p))
3547 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 nextByte:
3550 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003552 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 Py_XDECREF(errorHandler);
3555 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003557
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 Py_XDECREF(errorHandler);
3561 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 return NULL;
3563}
3564
3565PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003568 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 char *p;
3570 char *q;
3571
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003572#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003573 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003574#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003575 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003576#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003577
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003578 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003580
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003581 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 if (repr == NULL)
3583 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003584 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003585 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003587 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 while (size-- > 0) {
3589 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003590#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 /* Map 32-bit characters to '\Uxxxxxxxx' */
3592 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003593 *p++ = '\\';
3594 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003595 *p++ = hexdigits[(ch >> 28) & 0xf];
3596 *p++ = hexdigits[(ch >> 24) & 0xf];
3597 *p++ = hexdigits[(ch >> 20) & 0xf];
3598 *p++ = hexdigits[(ch >> 16) & 0xf];
3599 *p++ = hexdigits[(ch >> 12) & 0xf];
3600 *p++ = hexdigits[(ch >> 8) & 0xf];
3601 *p++ = hexdigits[(ch >> 4) & 0xf];
3602 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003603 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003604 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003605#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003606 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3607 if (ch >= 0xD800 && ch < 0xDC00) {
3608 Py_UNICODE ch2;
3609 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003610
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 ch2 = *s++;
3612 size--;
3613 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3614 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3615 *p++ = '\\';
3616 *p++ = 'U';
3617 *p++ = hexdigits[(ucs >> 28) & 0xf];
3618 *p++ = hexdigits[(ucs >> 24) & 0xf];
3619 *p++ = hexdigits[(ucs >> 20) & 0xf];
3620 *p++ = hexdigits[(ucs >> 16) & 0xf];
3621 *p++ = hexdigits[(ucs >> 12) & 0xf];
3622 *p++ = hexdigits[(ucs >> 8) & 0xf];
3623 *p++ = hexdigits[(ucs >> 4) & 0xf];
3624 *p++ = hexdigits[ucs & 0xf];
3625 continue;
3626 }
3627 /* Fall through: isolated surrogates are copied as-is */
3628 s--;
3629 size++;
3630 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003631#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 /* Map 16-bit characters to '\uxxxx' */
3633 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 *p++ = '\\';
3635 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003636 *p++ = hexdigits[(ch >> 12) & 0xf];
3637 *p++ = hexdigits[(ch >> 8) & 0xf];
3638 *p++ = hexdigits[(ch >> 4) & 0xf];
3639 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 /* Copy everything else as-is */
3642 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 *p++ = (char) ch;
3644 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003645 size = p - q;
3646
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003647 assert(size > 0);
3648 if (_PyBytes_Resize(&repr, size) < 0)
3649 return NULL;
3650 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651}
3652
3653PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3654{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003655 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003657 PyErr_BadArgument();
3658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003660 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3661 PyUnicode_GET_SIZE(unicode));
3662
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003663 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664}
3665
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003666/* --- Unicode Internal Codec ------------------------------------------- */
3667
3668PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 Py_ssize_t size,
3670 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003671{
3672 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003673 Py_ssize_t startinpos;
3674 Py_ssize_t endinpos;
3675 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003676 PyUnicodeObject *v;
3677 Py_UNICODE *p;
3678 const char *end;
3679 const char *reason;
3680 PyObject *errorHandler = NULL;
3681 PyObject *exc = NULL;
3682
Neal Norwitzd43069c2006-01-08 01:12:10 +00003683#ifdef Py_UNICODE_WIDE
3684 Py_UNICODE unimax = PyUnicode_GetMax();
3685#endif
3686
Thomas Wouters89f507f2006-12-13 04:49:30 +00003687 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003688 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3689 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003691 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003693 p = PyUnicode_AS_UNICODE(v);
3694 end = s + size;
3695
3696 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003697 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003698 /* We have to sanity check the raw data, otherwise doom looms for
3699 some malformed UCS-4 data. */
3700 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003701#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003702 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003703#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003704 end-s < Py_UNICODE_SIZE
3705 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003706 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003707 startinpos = s - starts;
3708 if (end-s < Py_UNICODE_SIZE) {
3709 endinpos = end-starts;
3710 reason = "truncated input";
3711 }
3712 else {
3713 endinpos = s - starts + Py_UNICODE_SIZE;
3714 reason = "illegal code point (> 0x10FFFF)";
3715 }
3716 outpos = p - PyUnicode_AS_UNICODE(v);
3717 if (unicode_decode_call_errorhandler(
3718 errors, &errorHandler,
3719 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003720 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003721 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003722 goto onError;
3723 }
3724 }
3725 else {
3726 p++;
3727 s += Py_UNICODE_SIZE;
3728 }
3729 }
3730
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003731 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003732 goto onError;
3733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
3735 return (PyObject *)v;
3736
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003738 Py_XDECREF(v);
3739 Py_XDECREF(errorHandler);
3740 Py_XDECREF(exc);
3741 return NULL;
3742}
3743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744/* --- Latin-1 Codec ------------------------------------------------------ */
3745
3746PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003747 Py_ssize_t size,
3748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749{
3750 PyUnicodeObject *v;
3751 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003752 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003755 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 Py_UNICODE r = *(unsigned char*)s;
3757 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003758 }
3759
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 v = _PyUnicode_New(size);
3761 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003766 e = s + size;
3767 /* Unrolling the copy makes it much faster by reducing the looping
3768 overhead. This is similar to what many memcpy() implementations do. */
3769 unrolled_end = e - 4;
3770 while (s < unrolled_end) {
3771 p[0] = (unsigned char) s[0];
3772 p[1] = (unsigned char) s[1];
3773 p[2] = (unsigned char) s[2];
3774 p[3] = (unsigned char) s[3];
3775 s += 4;
3776 p += 4;
3777 }
3778 while (s < e)
3779 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003781
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 Py_XDECREF(v);
3784 return NULL;
3785}
3786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787/* create or adjust a UnicodeEncodeError */
3788static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003789 const char *encoding,
3790 const Py_UNICODE *unicode, Py_ssize_t size,
3791 Py_ssize_t startpos, Py_ssize_t endpos,
3792 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 *exceptionObject = PyUnicodeEncodeError_Create(
3796 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 }
3798 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3800 goto onError;
3801 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3802 goto onError;
3803 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3804 goto onError;
3805 return;
3806 onError:
3807 Py_DECREF(*exceptionObject);
3808 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 }
3810}
3811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812/* raises a UnicodeEncodeError */
3813static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 const char *encoding,
3815 const Py_UNICODE *unicode, Py_ssize_t size,
3816 Py_ssize_t startpos, Py_ssize_t endpos,
3817 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818{
3819 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823}
3824
3825/* error handling callback helper:
3826 build arguments, call the callback and check the arguments,
3827 put the result into newpos and return the replacement string, which
3828 has to be freed by the caller */
3829static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 PyObject **errorHandler,
3831 const char *encoding, const char *reason,
3832 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3833 Py_ssize_t startpos, Py_ssize_t endpos,
3834 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003836 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837
3838 PyObject *restuple;
3839 PyObject *resunicode;
3840
3841 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003843 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003844 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 }
3846
3847 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851
3852 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003854 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003857 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003858 Py_DECREF(restuple);
3859 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 }
3861 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 &resunicode, newpos)) {
3863 Py_DECREF(restuple);
3864 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 }
3866 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003868 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3870 Py_DECREF(restuple);
3871 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003872 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 Py_INCREF(resunicode);
3874 Py_DECREF(restuple);
3875 return resunicode;
3876}
3877
3878static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00003879 Py_ssize_t size,
3880 const char *errors,
3881 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882{
3883 /* output object */
3884 PyObject *res;
3885 /* pointers to the beginning and end+1 of input */
3886 const Py_UNICODE *startp = p;
3887 const Py_UNICODE *endp = p + size;
3888 /* pointer to the beginning of the unencodable characters */
3889 /* const Py_UNICODE *badp = NULL; */
3890 /* pointer into the output */
3891 char *str;
3892 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003894 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3895 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 PyObject *errorHandler = NULL;
3897 PyObject *exc = NULL;
3898 /* the following variable is used for caching string comparisons
3899 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3900 int known_errorHandler = -1;
3901
3902 /* allocate enough for a simple encoding without
3903 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003904 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003905 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003906 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003908 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003909 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 ressize = size;
3911
3912 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914
Benjamin Peterson29060642009-01-31 22:14:21 +00003915 /* can we encode this? */
3916 if (c<limit) {
3917 /* no overflow check, because we know that the space is enough */
3918 *str++ = (char)c;
3919 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003920 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 else {
3922 Py_ssize_t unicodepos = p-startp;
3923 Py_ssize_t requiredsize;
3924 PyObject *repunicode;
3925 Py_ssize_t repsize;
3926 Py_ssize_t newpos;
3927 Py_ssize_t respos;
3928 Py_UNICODE *uni2;
3929 /* startpos for collecting unencodable chars */
3930 const Py_UNICODE *collstart = p;
3931 const Py_UNICODE *collend = p;
3932 /* find all unecodable characters */
3933 while ((collend < endp) && ((*collend)>=limit))
3934 ++collend;
3935 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3936 if (known_errorHandler==-1) {
3937 if ((errors==NULL) || (!strcmp(errors, "strict")))
3938 known_errorHandler = 1;
3939 else if (!strcmp(errors, "replace"))
3940 known_errorHandler = 2;
3941 else if (!strcmp(errors, "ignore"))
3942 known_errorHandler = 3;
3943 else if (!strcmp(errors, "xmlcharrefreplace"))
3944 known_errorHandler = 4;
3945 else
3946 known_errorHandler = 0;
3947 }
3948 switch (known_errorHandler) {
3949 case 1: /* strict */
3950 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3951 goto onError;
3952 case 2: /* replace */
3953 while (collstart++<collend)
3954 *str++ = '?'; /* fall through */
3955 case 3: /* ignore */
3956 p = collend;
3957 break;
3958 case 4: /* xmlcharrefreplace */
3959 respos = str - PyBytes_AS_STRING(res);
3960 /* determine replacement size (temporarily (mis)uses p) */
3961 for (p = collstart, repsize = 0; p < collend; ++p) {
3962 if (*p<10)
3963 repsize += 2+1+1;
3964 else if (*p<100)
3965 repsize += 2+2+1;
3966 else if (*p<1000)
3967 repsize += 2+3+1;
3968 else if (*p<10000)
3969 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003970#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 else
3972 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003973#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 else if (*p<100000)
3975 repsize += 2+5+1;
3976 else if (*p<1000000)
3977 repsize += 2+6+1;
3978 else
3979 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003980#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 }
3982 requiredsize = respos+repsize+(endp-collend);
3983 if (requiredsize > ressize) {
3984 if (requiredsize<2*ressize)
3985 requiredsize = 2*ressize;
3986 if (_PyBytes_Resize(&res, requiredsize))
3987 goto onError;
3988 str = PyBytes_AS_STRING(res) + respos;
3989 ressize = requiredsize;
3990 }
3991 /* generate replacement (temporarily (mis)uses p) */
3992 for (p = collstart; p < collend; ++p) {
3993 str += sprintf(str, "&#%d;", (int)*p);
3994 }
3995 p = collend;
3996 break;
3997 default:
3998 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3999 encoding, reason, startp, size, &exc,
4000 collstart-startp, collend-startp, &newpos);
4001 if (repunicode == NULL)
4002 goto onError;
4003 /* need more space? (at least enough for what we
4004 have+the replacement+the rest of the string, so
4005 we won't have to check space for encodable characters) */
4006 respos = str - PyBytes_AS_STRING(res);
4007 repsize = PyUnicode_GET_SIZE(repunicode);
4008 requiredsize = respos+repsize+(endp-collend);
4009 if (requiredsize > ressize) {
4010 if (requiredsize<2*ressize)
4011 requiredsize = 2*ressize;
4012 if (_PyBytes_Resize(&res, requiredsize)) {
4013 Py_DECREF(repunicode);
4014 goto onError;
4015 }
4016 str = PyBytes_AS_STRING(res) + respos;
4017 ressize = requiredsize;
4018 }
4019 /* check if there is anything unencodable in the replacement
4020 and copy it to the output */
4021 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4022 c = *uni2;
4023 if (c >= limit) {
4024 raise_encode_exception(&exc, encoding, startp, size,
4025 unicodepos, unicodepos+1, reason);
4026 Py_DECREF(repunicode);
4027 goto onError;
4028 }
4029 *str = (char)c;
4030 }
4031 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004032 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004034 }
4035 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004036 /* Resize if we allocated to much */
4037 size = str - PyBytes_AS_STRING(res);
4038 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004039 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004040 if (_PyBytes_Resize(&res, size) < 0)
4041 goto onError;
4042 }
4043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 Py_XDECREF(errorHandler);
4045 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004046 return res;
4047
4048 onError:
4049 Py_XDECREF(res);
4050 Py_XDECREF(errorHandler);
4051 Py_XDECREF(exc);
4052 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053}
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 Py_ssize_t size,
4057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060}
4061
4062PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4063{
4064 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 PyErr_BadArgument();
4066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 }
4068 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 PyUnicode_GET_SIZE(unicode),
4070 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071}
4072
4073/* --- 7-bit ASCII Codec -------------------------------------------------- */
4074
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 Py_ssize_t size,
4077 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 PyUnicodeObject *v;
4081 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004082 Py_ssize_t startinpos;
4083 Py_ssize_t endinpos;
4084 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 const char *e;
4086 PyObject *errorHandler = NULL;
4087 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004088
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004090 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 Py_UNICODE r = *(unsigned char*)s;
4092 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004093 }
Tim Petersced69f82003-09-16 20:30:58 +00004094
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 v = _PyUnicode_New(size);
4096 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 e = s + size;
4102 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 register unsigned char c = (unsigned char)*s;
4104 if (c < 128) {
4105 *p++ = c;
4106 ++s;
4107 }
4108 else {
4109 startinpos = s-starts;
4110 endinpos = startinpos + 1;
4111 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4112 if (unicode_decode_call_errorhandler(
4113 errors, &errorHandler,
4114 "ascii", "ordinal not in range(128)",
4115 &starts, &e, &startinpos, &endinpos, &exc, &s,
4116 &v, &outpos, &p))
4117 goto onError;
4118 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004120 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 Py_XDECREF(errorHandler);
4124 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004126
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 Py_XDECREF(errorHandler);
4130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 return NULL;
4132}
4133
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 Py_ssize_t size,
4136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139}
4140
4141PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4142{
4143 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 PyErr_BadArgument();
4145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 }
4147 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 PyUnicode_GET_SIZE(unicode),
4149 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150}
4151
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004153
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004154/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004155
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004156#if SIZEOF_INT < SIZEOF_SSIZE_T
4157#define NEED_RETRY
4158#endif
4159
4160/* XXX This code is limited to "true" double-byte encodings, as
4161 a) it assumes an incomplete character consists of a single byte, and
4162 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004164
/* Return 1 if the byte at s[offset] acts as a DBCS lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is counted when CharPrev() cannot
   step back from it, when the preceding character does not start with a
   lead byte, or when the preceding character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4175
4176/*
4177 * Decode MBCS string into unicode object. If 'final' is set, converts
4178 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4179 */
4180static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 const char *s, /* MBCS string */
4182 int size, /* sizeof MBCS string */
4183 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004184{
4185 Py_UNICODE *p;
4186 Py_ssize_t n = 0;
4187 int usize = 0;
4188
4189 assert(size >= 0);
4190
4191 /* Skip trailing lead-byte unless 'final' is set */
4192 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004194
4195 /* First get the size of the result */
4196 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4198 if (usize == 0) {
4199 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4200 return -1;
4201 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004202 }
4203
4204 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 /* Create unicode object */
4206 *v = _PyUnicode_New(usize);
4207 if (*v == NULL)
4208 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004209 }
4210 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 /* Extend unicode object */
4212 n = PyUnicode_GET_SIZE(*v);
4213 if (_PyUnicode_Resize(v, n + usize) < 0)
4214 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004215 }
4216
4217 /* Do the conversion */
4218 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 p = PyUnicode_AS_UNICODE(*v) + n;
4220 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4221 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4222 return -1;
4223 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004224 }
4225
4226 return size;
4227}
4228
4229PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 Py_ssize_t size,
4231 const char *errors,
4232 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004233{
4234 PyUnicodeObject *v = NULL;
4235 int done;
4236
4237 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004239
4240#ifdef NEED_RETRY
4241 retry:
4242 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004244 else
4245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004247
4248 if (done < 0) {
4249 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004251 }
4252
4253 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004255
4256#ifdef NEED_RETRY
4257 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 s += done;
4259 size -= done;
4260 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004261 }
4262#endif
4263
4264 return (PyObject *)v;
4265}
4266
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004267PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 Py_ssize_t size,
4269 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004270{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004271 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4272}
4273
4274/*
4275 * Convert unicode into string object (MBCS).
4276 * Returns 0 if succeed, -1 otherwise.
4277 */
4278static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 const Py_UNICODE *p, /* unicode */
4280 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004281{
4282 int mbcssize = 0;
4283 Py_ssize_t n = 0;
4284
4285 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004286
4287 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004288 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4290 if (mbcssize == 0) {
4291 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4292 return -1;
4293 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004294 }
4295
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004296 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 /* Create string object */
4298 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4299 if (*repr == NULL)
4300 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004301 }
4302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 /* Extend string object */
4304 n = PyBytes_Size(*repr);
4305 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4306 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004307 }
4308
4309 /* Do the conversion */
4310 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 char *s = PyBytes_AS_STRING(*repr) + n;
4312 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4313 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4314 return -1;
4315 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004316 }
4317
4318 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004319}
4320
4321PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 Py_ssize_t size,
4323 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004324{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004325 PyObject *repr = NULL;
4326 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004327
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004328#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004330 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004332 else
4333#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004335
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004336 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 Py_XDECREF(repr);
4338 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004339 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004340
4341#ifdef NEED_RETRY
4342 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 p += INT_MAX;
4344 size -= INT_MAX;
4345 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004346 }
4347#endif
4348
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004349 return repr;
4350}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004351
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004352PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4353{
4354 if (!PyUnicode_Check(unicode)) {
4355 PyErr_BadArgument();
4356 return NULL;
4357 }
4358 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 PyUnicode_GET_SIZE(unicode),
4360 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004361}
4362
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004363#undef NEED_RETRY
4364
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004365#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004366
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367/* --- Character Mapping Codec -------------------------------------------- */
4368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 Py_ssize_t size,
4371 PyObject *mapping,
4372 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004375 Py_ssize_t startinpos;
4376 Py_ssize_t endinpos;
4377 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 PyUnicodeObject *v;
4380 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 PyObject *errorHandler = NULL;
4383 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004384 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 /* Default to Latin-1 */
4388 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390
4391 v = _PyUnicode_New(size);
4392 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004398 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 mapstring = PyUnicode_AS_UNICODE(mapping);
4400 maplen = PyUnicode_GET_SIZE(mapping);
4401 while (s < e) {
4402 unsigned char ch = *s;
4403 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 if (ch < maplen)
4406 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 if (x == 0xfffe) {
4409 /* undefined mapping */
4410 outpos = p-PyUnicode_AS_UNICODE(v);
4411 startinpos = s-starts;
4412 endinpos = startinpos+1;
4413 if (unicode_decode_call_errorhandler(
4414 errors, &errorHandler,
4415 "charmap", "character maps to <undefined>",
4416 &starts, &e, &startinpos, &endinpos, &exc, &s,
4417 &v, &outpos, &p)) {
4418 goto onError;
4419 }
4420 continue;
4421 }
4422 *p++ = x;
4423 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004424 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004425 }
4426 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 while (s < e) {
4428 unsigned char ch = *s;
4429 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004430
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4432 w = PyLong_FromLong((long)ch);
4433 if (w == NULL)
4434 goto onError;
4435 x = PyObject_GetItem(mapping, w);
4436 Py_DECREF(w);
4437 if (x == NULL) {
4438 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4439 /* No mapping found means: mapping is undefined. */
4440 PyErr_Clear();
4441 x = Py_None;
4442 Py_INCREF(x);
4443 } else
4444 goto onError;
4445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004446
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 /* Apply mapping */
4448 if (PyLong_Check(x)) {
4449 long value = PyLong_AS_LONG(x);
4450 if (value < 0 || value > 65535) {
4451 PyErr_SetString(PyExc_TypeError,
4452 "character mapping must be in range(65536)");
4453 Py_DECREF(x);
4454 goto onError;
4455 }
4456 *p++ = (Py_UNICODE)value;
4457 }
4458 else if (x == Py_None) {
4459 /* undefined mapping */
4460 outpos = p-PyUnicode_AS_UNICODE(v);
4461 startinpos = s-starts;
4462 endinpos = startinpos+1;
4463 if (unicode_decode_call_errorhandler(
4464 errors, &errorHandler,
4465 "charmap", "character maps to <undefined>",
4466 &starts, &e, &startinpos, &endinpos, &exc, &s,
4467 &v, &outpos, &p)) {
4468 Py_DECREF(x);
4469 goto onError;
4470 }
4471 Py_DECREF(x);
4472 continue;
4473 }
4474 else if (PyUnicode_Check(x)) {
4475 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 if (targetsize == 1)
4478 /* 1-1 mapping */
4479 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 else if (targetsize > 1) {
4482 /* 1-n mapping */
4483 if (targetsize > extrachars) {
4484 /* resize first */
4485 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4486 Py_ssize_t needed = (targetsize - extrachars) + \
4487 (targetsize << 2);
4488 extrachars += needed;
4489 /* XXX overflow detection missing */
4490 if (_PyUnicode_Resize(&v,
4491 PyUnicode_GET_SIZE(v) + needed) < 0) {
4492 Py_DECREF(x);
4493 goto onError;
4494 }
4495 p = PyUnicode_AS_UNICODE(v) + oldpos;
4496 }
4497 Py_UNICODE_COPY(p,
4498 PyUnicode_AS_UNICODE(x),
4499 targetsize);
4500 p += targetsize;
4501 extrachars -= targetsize;
4502 }
4503 /* 1-0 mapping: skip the character */
4504 }
4505 else {
4506 /* wrong return value */
4507 PyErr_SetString(PyExc_TypeError,
4508 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004509 Py_DECREF(x);
4510 goto onError;
4511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 Py_DECREF(x);
4513 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 }
4516 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4518 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 Py_XDECREF(errorHandler);
4520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004522
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 Py_XDECREF(errorHandler);
4525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 Py_XDECREF(v);
4527 return NULL;
4528}
4529
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004530/* Charmap encoding: the lookup table */
4531
4532struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 PyObject_HEAD
4534 unsigned char level1[32];
4535 int count2, count3;
4536 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004537};
4538
4539static PyObject*
4540encoding_map_size(PyObject *obj, PyObject* args)
4541{
4542 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004543 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004545}
4546
4547static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004548 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 PyDoc_STR("Return the size (in bytes) of this object") },
4550 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551};
4552
4553static void
4554encoding_map_dealloc(PyObject* o)
4555{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004556 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004557}
4558
4559static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004560 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 "EncodingMap", /*tp_name*/
4562 sizeof(struct encoding_map), /*tp_basicsize*/
4563 0, /*tp_itemsize*/
4564 /* methods */
4565 encoding_map_dealloc, /*tp_dealloc*/
4566 0, /*tp_print*/
4567 0, /*tp_getattr*/
4568 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004569 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 0, /*tp_repr*/
4571 0, /*tp_as_number*/
4572 0, /*tp_as_sequence*/
4573 0, /*tp_as_mapping*/
4574 0, /*tp_hash*/
4575 0, /*tp_call*/
4576 0, /*tp_str*/
4577 0, /*tp_getattro*/
4578 0, /*tp_setattro*/
4579 0, /*tp_as_buffer*/
4580 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4581 0, /*tp_doc*/
4582 0, /*tp_traverse*/
4583 0, /*tp_clear*/
4584 0, /*tp_richcompare*/
4585 0, /*tp_weaklistoffset*/
4586 0, /*tp_iter*/
4587 0, /*tp_iternext*/
4588 encoding_map_methods, /*tp_methods*/
4589 0, /*tp_members*/
4590 0, /*tp_getset*/
4591 0, /*tp_base*/
4592 0, /*tp_dict*/
4593 0, /*tp_descr_get*/
4594 0, /*tp_descr_set*/
4595 0, /*tp_dictoffset*/
4596 0, /*tp_init*/
4597 0, /*tp_alloc*/
4598 0, /*tp_new*/
4599 0, /*tp_free*/
4600 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601};
4602
4603PyObject*
4604PyUnicode_BuildEncodingMap(PyObject* string)
4605{
4606 Py_UNICODE *decode;
4607 PyObject *result;
4608 struct encoding_map *mresult;
4609 int i;
4610 int need_dict = 0;
4611 unsigned char level1[32];
4612 unsigned char level2[512];
4613 unsigned char *mlevel1, *mlevel2, *mlevel3;
4614 int count2 = 0, count3 = 0;
4615
4616 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4617 PyErr_BadArgument();
4618 return NULL;
4619 }
4620 decode = PyUnicode_AS_UNICODE(string);
4621 memset(level1, 0xFF, sizeof level1);
4622 memset(level2, 0xFF, sizeof level2);
4623
4624 /* If there isn't a one-to-one mapping of NULL to \0,
4625 or if there are non-BMP characters, we need to use
4626 a mapping dictionary. */
4627 if (decode[0] != 0)
4628 need_dict = 1;
4629 for (i = 1; i < 256; i++) {
4630 int l1, l2;
4631 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004632#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004633 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004634#endif
4635 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004636 need_dict = 1;
4637 break;
4638 }
4639 if (decode[i] == 0xFFFE)
4640 /* unmapped character */
4641 continue;
4642 l1 = decode[i] >> 11;
4643 l2 = decode[i] >> 7;
4644 if (level1[l1] == 0xFF)
4645 level1[l1] = count2++;
4646 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004647 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004648 }
4649
4650 if (count2 >= 0xFF || count3 >= 0xFF)
4651 need_dict = 1;
4652
4653 if (need_dict) {
4654 PyObject *result = PyDict_New();
4655 PyObject *key, *value;
4656 if (!result)
4657 return NULL;
4658 for (i = 0; i < 256; i++) {
4659 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004660 key = PyLong_FromLong(decode[i]);
4661 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004662 if (!key || !value)
4663 goto failed1;
4664 if (PyDict_SetItem(result, key, value) == -1)
4665 goto failed1;
4666 Py_DECREF(key);
4667 Py_DECREF(value);
4668 }
4669 return result;
4670 failed1:
4671 Py_XDECREF(key);
4672 Py_XDECREF(value);
4673 Py_DECREF(result);
4674 return NULL;
4675 }
4676
4677 /* Create a three-level trie */
4678 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4679 16*count2 + 128*count3 - 1);
4680 if (!result)
4681 return PyErr_NoMemory();
4682 PyObject_Init(result, &EncodingMapType);
4683 mresult = (struct encoding_map*)result;
4684 mresult->count2 = count2;
4685 mresult->count3 = count3;
4686 mlevel1 = mresult->level1;
4687 mlevel2 = mresult->level23;
4688 mlevel3 = mresult->level23 + 16*count2;
4689 memcpy(mlevel1, level1, 32);
4690 memset(mlevel2, 0xFF, 16*count2);
4691 memset(mlevel3, 0, 128*count3);
4692 count3 = 0;
4693 for (i = 1; i < 256; i++) {
4694 int o1, o2, o3, i2, i3;
4695 if (decode[i] == 0xFFFE)
4696 /* unmapped character */
4697 continue;
4698 o1 = decode[i]>>11;
4699 o2 = (decode[i]>>7) & 0xF;
4700 i2 = 16*mlevel1[o1] + o2;
4701 if (mlevel2[i2] == 0xFF)
4702 mlevel2[i2] = count3++;
4703 o3 = decode[i] & 0x7F;
4704 i3 = 128*mlevel2[i2] + o3;
4705 mlevel3[i3] = i;
4706 }
4707 return result;
4708}
4709
4710static int
4711encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4712{
4713 struct encoding_map *map = (struct encoding_map*)mapping;
4714 int l1 = c>>11;
4715 int l2 = (c>>7) & 0xF;
4716 int l3 = c & 0x7F;
4717 int i;
4718
4719#ifdef Py_UNICODE_WIDE
4720 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004722 }
4723#endif
4724 if (c == 0)
4725 return 0;
4726 /* level 1*/
4727 i = map->level1[l1];
4728 if (i == 0xFF) {
4729 return -1;
4730 }
4731 /* level 2*/
4732 i = map->level23[16*i+l2];
4733 if (i == 0xFF) {
4734 return -1;
4735 }
4736 /* level 3 */
4737 i = map->level23[16*map->count2 + 128*i + l3];
4738 if (i == 0) {
4739 return -1;
4740 }
4741 return i;
4742}
4743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744/* Lookup the character ch in the mapping. If the character
4745 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004746 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748{
Christian Heimes217cfd12007-12-02 14:31:20 +00004749 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 PyObject *x;
4751
4752 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 x = PyObject_GetItem(mapping, w);
4755 Py_DECREF(w);
4756 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4758 /* No mapping found means: mapping is undefined. */
4759 PyErr_Clear();
4760 x = Py_None;
4761 Py_INCREF(x);
4762 return x;
4763 } else
4764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004766 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004768 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004769 long value = PyLong_AS_LONG(x);
4770 if (value < 0 || value > 255) {
4771 PyErr_SetString(PyExc_TypeError,
4772 "character mapping must be in range(256)");
4773 Py_DECREF(x);
4774 return NULL;
4775 }
4776 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004778 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 /* wrong return value */
4782 PyErr_Format(PyExc_TypeError,
4783 "character mapping must return integer, bytes or None, not %.400s",
4784 x->ob_type->tp_name);
4785 Py_DECREF(x);
4786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 }
4788}
4789
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004790static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004792{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004793 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4794 /* exponentially overallocate to minimize reallocations */
4795 if (requiredsize < 2*outsize)
4796 requiredsize = 2*outsize;
4797 if (_PyBytes_Resize(outobj, requiredsize))
4798 return -1;
4799 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004800}
4801
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004806 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 space is available. Return a new reference to the object that
4808 was put in the output buffer, or Py_None, if the mapping was undefined
4809 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004810 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004812charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004815 PyObject *rep;
4816 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004817 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818
Christian Heimes90aa7642007-12-19 02:45:37 +00004819 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004820 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004822 if (res == -1)
4823 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 if (outsize<requiredsize)
4825 if (charmapencode_resize(outobj, outpos, requiredsize))
4826 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004827 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 outstart[(*outpos)++] = (char)res;
4829 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004830 }
4831
4832 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004835 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 Py_DECREF(rep);
4837 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004838 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 if (PyLong_Check(rep)) {
4840 Py_ssize_t requiredsize = *outpos+1;
4841 if (outsize<requiredsize)
4842 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4843 Py_DECREF(rep);
4844 return enc_EXCEPTION;
4845 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004846 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004848 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 else {
4850 const char *repchars = PyBytes_AS_STRING(rep);
4851 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4852 Py_ssize_t requiredsize = *outpos+repsize;
4853 if (outsize<requiredsize)
4854 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4855 Py_DECREF(rep);
4856 return enc_EXCEPTION;
4857 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004858 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 memcpy(outstart + *outpos, repchars, repsize);
4860 *outpos += repsize;
4861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004863 Py_DECREF(rep);
4864 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865}
4866
4867/* handle an error in PyUnicode_EncodeCharmap
4868 Return 0 on success, -1 on error */
4869static
4870int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004873 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004874 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875{
4876 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t repsize;
4878 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 Py_UNICODE *uni2;
4880 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t collstartpos = *inpos;
4882 Py_ssize_t collendpos = *inpos+1;
4883 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 char *encoding = "charmap";
4885 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 /* find all unencodable characters */
4889 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004890 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004891 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 int res = encoding_map_lookup(p[collendpos], mapping);
4893 if (res != -1)
4894 break;
4895 ++collendpos;
4896 continue;
4897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004898
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 rep = charmapencode_lookup(p[collendpos], mapping);
4900 if (rep==NULL)
4901 return -1;
4902 else if (rep!=Py_None) {
4903 Py_DECREF(rep);
4904 break;
4905 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004906 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 }
4909 /* cache callback name lookup
4910 * (if not done yet, i.e. it's the first error) */
4911 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 if ((errors==NULL) || (!strcmp(errors, "strict")))
4913 *known_errorHandler = 1;
4914 else if (!strcmp(errors, "replace"))
4915 *known_errorHandler = 2;
4916 else if (!strcmp(errors, "ignore"))
4917 *known_errorHandler = 3;
4918 else if (!strcmp(errors, "xmlcharrefreplace"))
4919 *known_errorHandler = 4;
4920 else
4921 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 }
4923 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004924 case 1: /* strict */
4925 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4926 return -1;
4927 case 2: /* replace */
4928 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 x = charmapencode_output('?', mapping, res, respos);
4930 if (x==enc_EXCEPTION) {
4931 return -1;
4932 }
4933 else if (x==enc_FAILED) {
4934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4935 return -1;
4936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004937 }
4938 /* fall through */
4939 case 3: /* ignore */
4940 *inpos = collendpos;
4941 break;
4942 case 4: /* xmlcharrefreplace */
4943 /* generate replacement (temporarily (mis)uses p) */
4944 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 char buffer[2+29+1+1];
4946 char *cp;
4947 sprintf(buffer, "&#%d;", (int)p[collpos]);
4948 for (cp = buffer; *cp; ++cp) {
4949 x = charmapencode_output(*cp, mapping, res, respos);
4950 if (x==enc_EXCEPTION)
4951 return -1;
4952 else if (x==enc_FAILED) {
4953 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4954 return -1;
4955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004956 }
4957 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004958 *inpos = collendpos;
4959 break;
4960 default:
4961 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 encoding, reason, p, size, exceptionObject,
4963 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004964 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 return -1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004966 /* generate replacement */
4967 repsize = PyUnicode_GET_SIZE(repunicode);
4968 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 x = charmapencode_output(*uni2, mapping, res, respos);
4970 if (x==enc_EXCEPTION) {
4971 return -1;
4972 }
4973 else if (x==enc_FAILED) {
4974 Py_DECREF(repunicode);
4975 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4976 return -1;
4977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004978 }
4979 *inpos = newpos;
4980 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 }
4982 return 0;
4983}
4984
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 Py_ssize_t size,
4987 PyObject *mapping,
4988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990 /* output object */
4991 PyObject *res = NULL;
4992 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 PyObject *errorHandler = NULL;
4997 PyObject *exc = NULL;
4998 /* the following variable is used for caching string comparisons
4999 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5000 * 3=ignore, 4=xmlcharrefreplace */
5001 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002
5003 /* Default to Latin-1 */
5004 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 /* allocate enough for a simple encoding without
5008 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005009 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 if (res == NULL)
5011 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005012 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 /* try to encode it */
5017 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5018 if (x==enc_EXCEPTION) /* error */
5019 goto onError;
5020 if (x==enc_FAILED) { /* unencodable character */
5021 if (charmap_encoding_error(p, size, &inpos, mapping,
5022 &exc,
5023 &known_errorHandler, &errorHandler, errors,
5024 &res, &respos)) {
5025 goto onError;
5026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005027 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 else
5029 /* done with this character => adjust input position */
5030 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005034 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005035 if (_PyBytes_Resize(&res, respos) < 0)
5036 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 Py_XDECREF(exc);
5039 Py_XDECREF(errorHandler);
5040 return res;
5041
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 Py_XDECREF(res);
5044 Py_XDECREF(exc);
5045 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return NULL;
5047}
5048
5049PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
5052 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 PyErr_BadArgument();
5054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 }
5056 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 PyUnicode_GET_SIZE(unicode),
5058 mapping,
5059 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060}
5061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062/* create or adjust a UnicodeTranslateError */
5063static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 const Py_UNICODE *unicode, Py_ssize_t size,
5065 Py_ssize_t startpos, Py_ssize_t endpos,
5066 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005069 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
5072 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5074 goto onError;
5075 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5076 goto onError;
5077 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5078 goto onError;
5079 return;
5080 onError:
5081 Py_DECREF(*exceptionObject);
5082 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 }
5084}
5085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086/* raises a UnicodeTranslateError */
5087static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 const Py_UNICODE *unicode, Py_ssize_t size,
5089 Py_ssize_t startpos, Py_ssize_t endpos,
5090 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091{
5092 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096}
5097
5098/* error handling callback helper:
5099 build arguments, call the callback and check the arguments,
5100 put the result into newpos and return the replacement string, which
5101 has to be freed by the caller */
5102static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 PyObject **errorHandler,
5104 const char *reason,
5105 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5106 Py_ssize_t startpos, Py_ssize_t endpos,
5107 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005108{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005109 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005111 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 PyObject *restuple;
5113 PyObject *resunicode;
5114
5115 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 }
5120
5121 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125
5126 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005131 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 Py_DECREF(restuple);
5133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 }
5135 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 &resunicode, &i_newpos)) {
5137 Py_DECREF(restuple);
5138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005142 else
5143 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005144 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5146 Py_DECREF(restuple);
5147 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 Py_INCREF(resunicode);
5150 Py_DECREF(restuple);
5151 return resunicode;
5152}
5153
5154/* Lookup the character ch in the mapping and put the result in result,
5155 which must be decrefed by the caller.
5156 Return 0 on success, -1 on error */
5157static
5158int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5159{
Christian Heimes217cfd12007-12-02 14:31:20 +00005160 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005161 PyObject *x;
5162
5163 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 x = PyObject_GetItem(mapping, w);
5166 Py_DECREF(w);
5167 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5169 /* No mapping found means: use 1:1 mapping. */
5170 PyErr_Clear();
5171 *result = NULL;
5172 return 0;
5173 } else
5174 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175 }
5176 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 *result = x;
5178 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005180 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 long value = PyLong_AS_LONG(x);
5182 long max = PyUnicode_GetMax();
5183 if (value < 0 || value > max) {
5184 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005185 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 Py_DECREF(x);
5187 return -1;
5188 }
5189 *result = x;
5190 return 0;
5191 }
5192 else if (PyUnicode_Check(x)) {
5193 *result = x;
5194 return 0;
5195 }
5196 else {
5197 /* wrong return value */
5198 PyErr_SetString(PyExc_TypeError,
5199 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005200 Py_DECREF(x);
5201 return -1;
5202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203}
5204/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 if not reallocate and adjust various state variables.
5206 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207static
Walter Dörwald4894c302003-10-24 14:25:28 +00005208int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005211 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005212 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 /* remember old output position */
5214 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5215 /* exponentially overallocate to minimize reallocations */
5216 if (requiredsize < 2 * oldsize)
5217 requiredsize = 2 * oldsize;
5218 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5219 return -1;
5220 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 }
5222 return 0;
5223}
5224/* lookup the character, put the result in the output string and adjust
5225 various state variables. Return a new reference to the object that
5226 was put in the output buffer in *result, or Py_None, if the mapping was
5227 undefined (in which case no character was written).
5228 The called must decref result.
5229 Return 0 on success, -1 on error. */
5230static
Walter Dörwald4894c302003-10-24 14:25:28 +00005231int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5233 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234{
Walter Dörwald4894c302003-10-24 14:25:28 +00005235 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 /* not found => default to 1:1 mapping */
5239 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005240 }
5241 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005243 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 /* no overflow check, because we know that the space is enough */
5245 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 }
5247 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5249 if (repsize==1) {
5250 /* no overflow check, because we know that the space is enough */
5251 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5252 }
5253 else if (repsize!=0) {
5254 /* more than one character */
5255 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5256 (insize - (curinp-startinp)) +
5257 repsize - 1;
5258 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5259 return -1;
5260 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5261 *outp += repsize;
5262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263 }
5264 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 return 0;
5267}
5268
5269PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_ssize_t size,
5271 PyObject *mapping,
5272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274 /* output object */
5275 PyObject *res = NULL;
5276 /* pointers to the beginning and end+1 of input */
5277 const Py_UNICODE *startp = p;
5278 const Py_UNICODE *endp = p + size;
5279 /* pointer into the output */
5280 Py_UNICODE *str;
5281 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005282 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 char *reason = "character maps to <undefined>";
5284 PyObject *errorHandler = NULL;
5285 PyObject *exc = NULL;
5286 /* the following variable is used for caching string comparisons
5287 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5288 * 3=ignore, 4=xmlcharrefreplace */
5289 int known_errorHandler = -1;
5290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 PyErr_BadArgument();
5293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295
5296 /* allocate enough for a simple 1:1 translation without
5297 replacements, if we need more, we'll resize */
5298 res = PyUnicode_FromUnicode(NULL, size);
5299 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005303 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 /* try to encode it */
5307 PyObject *x = NULL;
5308 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5309 Py_XDECREF(x);
5310 goto onError;
5311 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005312 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 if (x!=Py_None) /* it worked => adjust input pointer */
5314 ++p;
5315 else { /* untranslatable character */
5316 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5317 Py_ssize_t repsize;
5318 Py_ssize_t newpos;
5319 Py_UNICODE *uni2;
5320 /* startpos for collecting untranslatable chars */
5321 const Py_UNICODE *collstart = p;
5322 const Py_UNICODE *collend = p+1;
5323 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 /* find all untranslatable characters */
5326 while (collend < endp) {
5327 if (charmaptranslate_lookup(*collend, mapping, &x))
5328 goto onError;
5329 Py_XDECREF(x);
5330 if (x!=Py_None)
5331 break;
5332 ++collend;
5333 }
5334 /* cache callback name lookup
5335 * (if not done yet, i.e. it's the first error) */
5336 if (known_errorHandler==-1) {
5337 if ((errors==NULL) || (!strcmp(errors, "strict")))
5338 known_errorHandler = 1;
5339 else if (!strcmp(errors, "replace"))
5340 known_errorHandler = 2;
5341 else if (!strcmp(errors, "ignore"))
5342 known_errorHandler = 3;
5343 else if (!strcmp(errors, "xmlcharrefreplace"))
5344 known_errorHandler = 4;
5345 else
5346 known_errorHandler = 0;
5347 }
5348 switch (known_errorHandler) {
5349 case 1: /* strict */
5350 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005351 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 case 2: /* replace */
5353 /* No need to check for space, this is a 1:1 replacement */
5354 for (coll = collstart; coll<collend; ++coll)
5355 *str++ = '?';
5356 /* fall through */
5357 case 3: /* ignore */
5358 p = collend;
5359 break;
5360 case 4: /* xmlcharrefreplace */
5361 /* generate replacement (temporarily (mis)uses p) */
5362 for (p = collstart; p < collend; ++p) {
5363 char buffer[2+29+1+1];
5364 char *cp;
5365 sprintf(buffer, "&#%d;", (int)*p);
5366 if (charmaptranslate_makespace(&res, &str,
5367 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5368 goto onError;
5369 for (cp = buffer; *cp; ++cp)
5370 *str++ = *cp;
5371 }
5372 p = collend;
5373 break;
5374 default:
5375 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5376 reason, startp, size, &exc,
5377 collstart-startp, collend-startp, &newpos);
5378 if (repunicode == NULL)
5379 goto onError;
5380 /* generate replacement */
5381 repsize = PyUnicode_GET_SIZE(repunicode);
5382 if (charmaptranslate_makespace(&res, &str,
5383 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5384 Py_DECREF(repunicode);
5385 goto onError;
5386 }
5387 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5388 *str++ = *uni2;
5389 p = startp + newpos;
5390 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005391 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005392 }
5393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 /* Resize if we allocated to much */
5395 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005396 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 if (PyUnicode_Resize(&res, respos) < 0)
5398 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 }
5400 Py_XDECREF(exc);
5401 Py_XDECREF(errorHandler);
5402 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405 Py_XDECREF(res);
5406 Py_XDECREF(exc);
5407 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 return NULL;
5409}
5410
5411PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 PyObject *mapping,
5413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
5415 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005416
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 str = PyUnicode_FromObject(str);
5418 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 PyUnicode_GET_SIZE(str),
5422 mapping,
5423 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 Py_DECREF(str);
5425 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 Py_XDECREF(str);
5429 return NULL;
5430}
Tim Petersced69f82003-09-16 20:30:58 +00005431
Guido van Rossum9e896b32000-04-05 20:11:21 +00005432/* --- Decimal Encoder ---------------------------------------------------- */
5433
5434int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 Py_ssize_t length,
5436 char *output,
5437 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005438{
5439 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 PyObject *errorHandler = NULL;
5441 PyObject *exc = NULL;
5442 const char *encoding = "decimal";
5443 const char *reason = "invalid decimal Unicode string";
5444 /* the following variable is used for caching string comparisons
5445 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5446 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005447
5448 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 PyErr_BadArgument();
5450 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005451 }
5452
5453 p = s;
5454 end = s + length;
5455 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 register Py_UNICODE ch = *p;
5457 int decimal;
5458 PyObject *repunicode;
5459 Py_ssize_t repsize;
5460 Py_ssize_t newpos;
5461 Py_UNICODE *uni2;
5462 Py_UNICODE *collstart;
5463 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005464
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005466 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 ++p;
5468 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 decimal = Py_UNICODE_TODECIMAL(ch);
5471 if (decimal >= 0) {
5472 *output++ = '0' + decimal;
5473 ++p;
5474 continue;
5475 }
5476 if (0 < ch && ch < 256) {
5477 *output++ = (char)ch;
5478 ++p;
5479 continue;
5480 }
5481 /* All other characters are considered unencodable */
5482 collstart = p;
5483 collend = p+1;
5484 while (collend < end) {
5485 if ((0 < *collend && *collend < 256) ||
5486 !Py_UNICODE_ISSPACE(*collend) ||
5487 Py_UNICODE_TODECIMAL(*collend))
5488 break;
5489 }
5490 /* cache callback name lookup
5491 * (if not done yet, i.e. it's the first error) */
5492 if (known_errorHandler==-1) {
5493 if ((errors==NULL) || (!strcmp(errors, "strict")))
5494 known_errorHandler = 1;
5495 else if (!strcmp(errors, "replace"))
5496 known_errorHandler = 2;
5497 else if (!strcmp(errors, "ignore"))
5498 known_errorHandler = 3;
5499 else if (!strcmp(errors, "xmlcharrefreplace"))
5500 known_errorHandler = 4;
5501 else
5502 known_errorHandler = 0;
5503 }
5504 switch (known_errorHandler) {
5505 case 1: /* strict */
5506 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5507 goto onError;
5508 case 2: /* replace */
5509 for (p = collstart; p < collend; ++p)
5510 *output++ = '?';
5511 /* fall through */
5512 case 3: /* ignore */
5513 p = collend;
5514 break;
5515 case 4: /* xmlcharrefreplace */
5516 /* generate replacement (temporarily (mis)uses p) */
5517 for (p = collstart; p < collend; ++p)
5518 output += sprintf(output, "&#%d;", (int)*p);
5519 p = collend;
5520 break;
5521 default:
5522 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5523 encoding, reason, s, length, &exc,
5524 collstart-s, collend-s, &newpos);
5525 if (repunicode == NULL)
5526 goto onError;
5527 /* generate replacement */
5528 repsize = PyUnicode_GET_SIZE(repunicode);
5529 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5530 Py_UNICODE ch = *uni2;
5531 if (Py_UNICODE_ISSPACE(ch))
5532 *output++ = ' ';
5533 else {
5534 decimal = Py_UNICODE_TODECIMAL(ch);
5535 if (decimal >= 0)
5536 *output++ = '0' + decimal;
5537 else if (0 < ch && ch < 256)
5538 *output++ = (char)ch;
5539 else {
5540 Py_DECREF(repunicode);
5541 raise_encode_exception(&exc, encoding,
5542 s, length, collstart-s, collend-s, reason);
5543 goto onError;
5544 }
5545 }
5546 }
5547 p = s + newpos;
5548 Py_DECREF(repunicode);
5549 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005550 }
5551 /* 0-terminate the output string */
5552 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 Py_XDECREF(exc);
5554 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005555 return 0;
5556
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 Py_XDECREF(exc);
5559 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005560 return -1;
5561}
5562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563/* --- Helpers ------------------------------------------------------------ */
5564
Eric Smith8c663262007-08-25 02:26:07 +00005565#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005566#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005567#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005568/* Include _ParseTupleFinds from find.h */
5569#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005570#include "stringlib/find.h"
5571#include "stringlib/partition.h"
5572
Eric Smith5807c412008-05-11 21:00:57 +00005573#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5574#include "stringlib/localeutil.h"
5575
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the calling scope, using
   (obj)->length as the sequence length:
     - a negative start is offset by the length, then clamped to 0;
     - an end beyond the length is clamped to the length;
     - a negative end is offset by the length, then clamped to 0.
   start is NOT clamped to the length, so start > end is possible
   afterwards and must be handled by the caller.

   Wrapped in do { } while (0) so the macro expands to exactly one
   statement and stays correct under an unbraced if/else.  Use it with a
   trailing semicolon: FIX_START_END(obj); */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5588
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 PyObject *substr,
5591 Py_ssize_t start,
5592 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005594 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005595 PyUnicodeObject* str_obj;
5596 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005597
Thomas Wouters477c8d52006-05-27 19:21:47 +00005598 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5599 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005601 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5602 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 Py_DECREF(str_obj);
5604 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 }
Tim Petersced69f82003-09-16 20:30:58 +00005606
Thomas Wouters477c8d52006-05-27 19:21:47 +00005607 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005608
Thomas Wouters477c8d52006-05-27 19:21:47 +00005609 result = stringlib_count(
5610 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5611 );
5612
5613 Py_DECREF(sub_obj);
5614 Py_DECREF(str_obj);
5615
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 return result;
5617}
5618
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005620 PyObject *sub,
5621 Py_ssize_t start,
5622 Py_ssize_t end,
5623 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005625 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005628 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005630 sub = PyUnicode_FromObject(sub);
5631 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 Py_DECREF(str);
5633 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 }
Tim Petersced69f82003-09-16 20:30:58 +00005635
Thomas Wouters477c8d52006-05-27 19:21:47 +00005636 if (direction > 0)
5637 result = stringlib_find_slice(
5638 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5639 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5640 start, end
5641 );
5642 else
5643 result = stringlib_rfind_slice(
5644 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5645 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5646 start, end
5647 );
5648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005650 Py_DECREF(sub);
5651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 return result;
5653}
5654
Tim Petersced69f82003-09-16 20:30:58 +00005655static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 PyUnicodeObject *substring,
5658 Py_ssize_t start,
5659 Py_ssize_t end,
5660 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 if (substring->length == 0)
5663 return 1;
5664
Thomas Wouters477c8d52006-05-27 19:21:47 +00005665 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
5667 end -= substring->length;
5668 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
5671 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 if (Py_UNICODE_MATCH(self, end, substring))
5673 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 } else {
5675 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
5678
5679 return 0;
5680}
5681
Martin v. Löwis18e16552006-02-15 17:27:45 +00005682Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 PyObject *substr,
5684 Py_ssize_t start,
5685 Py_ssize_t end,
5686 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005688 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 str = PyUnicode_FromObject(str);
5691 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 substr = PyUnicode_FromObject(substr);
5694 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 Py_DECREF(str);
5696 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 }
Tim Petersced69f82003-09-16 20:30:58 +00005698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 (PyUnicodeObject *)substr,
5701 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 Py_DECREF(str);
5703 Py_DECREF(substr);
5704 return result;
5705}
5706
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707/* Apply fixfct filter to the Unicode object self and return a
5708 reference to the modified object */
5709
Tim Petersced69f82003-09-16 20:30:58 +00005710static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713{
5714
5715 PyUnicodeObject *u;
5716
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005717 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005720
5721 Py_UNICODE_COPY(u->str, self->str, self->length);
5722
Tim Peters7a29bd52001-09-12 03:03:31 +00005723 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 /* fixfct should return TRUE if it modified the buffer. If
5725 FALSE, return a reference to the original buffer instead
5726 (to save space, not time) */
5727 Py_INCREF(self);
5728 Py_DECREF(u);
5729 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 }
5731 return (PyObject*) u;
5732}
5733
Tim Petersced69f82003-09-16 20:30:58 +00005734static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735int fixupper(PyUnicodeObject *self)
5736{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005737 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 Py_UNICODE *s = self->str;
5739 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005740
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005743
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 ch = Py_UNICODE_TOUPPER(*s);
5745 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 *s = ch;
5748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 s++;
5750 }
5751
5752 return status;
5753}
5754
Tim Petersced69f82003-09-16 20:30:58 +00005755static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756int fixlower(PyUnicodeObject *self)
5757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 Py_UNICODE *s = self->str;
5760 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005764
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 ch = Py_UNICODE_TOLOWER(*s);
5766 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 *s = ch;
5769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 s++;
5771 }
5772
5773 return status;
5774}
5775
Tim Petersced69f82003-09-16 20:30:58 +00005776static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777int fixswapcase(PyUnicodeObject *self)
5778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 Py_UNICODE *s = self->str;
5781 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005782
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 while (len-- > 0) {
5784 if (Py_UNICODE_ISUPPER(*s)) {
5785 *s = Py_UNICODE_TOLOWER(*s);
5786 status = 1;
5787 } else if (Py_UNICODE_ISLOWER(*s)) {
5788 *s = Py_UNICODE_TOUPPER(*s);
5789 status = 1;
5790 }
5791 s++;
5792 }
5793
5794 return status;
5795}
5796
Tim Petersced69f82003-09-16 20:30:58 +00005797static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798int fixcapitalize(PyUnicodeObject *self)
5799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005801 Py_UNICODE *s = self->str;
5802 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005803
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005804 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005806 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 *s = Py_UNICODE_TOUPPER(*s);
5808 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005810 s++;
5811 while (--len > 0) {
5812 if (Py_UNICODE_ISUPPER(*s)) {
5813 *s = Py_UNICODE_TOLOWER(*s);
5814 status = 1;
5815 }
5816 s++;
5817 }
5818 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819}
5820
5821static
5822int fixtitle(PyUnicodeObject *self)
5823{
5824 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5825 register Py_UNICODE *e;
5826 int previous_is_cased;
5827
5828 /* Shortcut for single character strings */
5829 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5831 if (*p != ch) {
5832 *p = ch;
5833 return 1;
5834 }
5835 else
5836 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 }
Tim Petersced69f82003-09-16 20:30:58 +00005838
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 e = p + PyUnicode_GET_SIZE(self);
5840 previous_is_cased = 0;
5841 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005843
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 if (previous_is_cased)
5845 *p = Py_UNICODE_TOLOWER(ch);
5846 else
5847 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005848
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 if (Py_UNICODE_ISLOWER(ch) ||
5850 Py_UNICODE_ISUPPER(ch) ||
5851 Py_UNICODE_ISTITLE(ch))
5852 previous_is_cased = 1;
5853 else
5854 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 }
5856 return 1;
5857}
5858
Tim Peters8ce9f162004-08-27 01:49:32 +00005859PyObject *
5860PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Skip Montanaro6543b452004-09-16 03:28:13 +00005862 const Py_UNICODE blank = ' ';
5863 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005864 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005865 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005866 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5867 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005868 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5869 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005870 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005871 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
Tim Peters05eba1f2004-08-27 21:32:02 +00005873 fseq = PySequence_Fast(seq, "");
5874 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005875 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005876 }
5877
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005878 /* NOTE: the following code can't call back into Python code,
5879 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005880 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005881
Tim Peters05eba1f2004-08-27 21:32:02 +00005882 seqlen = PySequence_Fast_GET_SIZE(fseq);
5883 /* If empty sequence, return u"". */
5884 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005885 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5886 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005887 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005888 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005889 /* If singleton sequence with an exact Unicode, return that. */
5890 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 item = items[0];
5892 if (PyUnicode_CheckExact(item)) {
5893 Py_INCREF(item);
5894 res = (PyUnicodeObject *)item;
5895 goto Done;
5896 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005897 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005898 else {
5899 /* Set up sep and seplen */
5900 if (separator == NULL) {
5901 sep = &blank;
5902 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005903 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005904 else {
5905 if (!PyUnicode_Check(separator)) {
5906 PyErr_Format(PyExc_TypeError,
5907 "separator: expected str instance,"
5908 " %.80s found",
5909 Py_TYPE(separator)->tp_name);
5910 goto onError;
5911 }
5912 sep = PyUnicode_AS_UNICODE(separator);
5913 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005914 }
5915 }
5916
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005917 /* There are at least two things to join, or else we have a subclass
5918 * of str in the sequence.
5919 * Do a pre-pass to figure out the total amount of space we'll
5920 * need (sz), and see whether all argument are strings.
5921 */
5922 sz = 0;
5923 for (i = 0; i < seqlen; i++) {
5924 const Py_ssize_t old_sz = sz;
5925 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 if (!PyUnicode_Check(item)) {
5927 PyErr_Format(PyExc_TypeError,
5928 "sequence item %zd: expected str instance,"
5929 " %.80s found",
5930 i, Py_TYPE(item)->tp_name);
5931 goto onError;
5932 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005933 sz += PyUnicode_GET_SIZE(item);
5934 if (i != 0)
5935 sz += seplen;
5936 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5937 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005939 goto onError;
5940 }
5941 }
Tim Petersced69f82003-09-16 20:30:58 +00005942
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005943 res = _PyUnicode_New(sz);
5944 if (res == NULL)
5945 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005946
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005947 /* Catenate everything. */
5948 res_p = PyUnicode_AS_UNICODE(res);
5949 for (i = 0; i < seqlen; ++i) {
5950 Py_ssize_t itemlen;
5951 item = items[i];
5952 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 /* Copy item, and maybe the separator. */
5954 if (i) {
5955 Py_UNICODE_COPY(res_p, sep, seplen);
5956 res_p += seplen;
5957 }
5958 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5959 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005960 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005963 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 return (PyObject *)res;
5965
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005967 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005968 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 return NULL;
5970}
5971
Tim Petersced69f82003-09-16 20:30:58 +00005972static
5973PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 Py_ssize_t left,
5975 Py_ssize_t right,
5976 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977{
5978 PyUnicodeObject *u;
5979
5980 if (left < 0)
5981 left = 0;
5982 if (right < 0)
5983 right = 0;
5984
Tim Peters7a29bd52001-09-12 03:03:31 +00005985 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 Py_INCREF(self);
5987 return self;
5988 }
5989
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005990 if (left > PY_SSIZE_T_MAX - self->length ||
5991 right > PY_SSIZE_T_MAX - (left + self->length)) {
5992 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5993 return NULL;
5994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 u = _PyUnicode_New(left + self->length + right);
5996 if (u) {
5997 if (left)
5998 Py_UNICODE_FILL(u->str, fill, left);
5999 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6000 if (right)
6001 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6002 }
6003
6004 return u;
6005}
6006
/* Append the slice data[left:right] to `list` as a new string object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating the string or appending fails.

   Wrapped in do { } while (0) so that it behaves as a single statement
   (e.g. as the body of an unbraced `if`); invoke it with a trailing `;`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
6018static
6019PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 PyObject *list,
6021 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006023 register Py_ssize_t i;
6024 register Py_ssize_t j;
6025 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006027 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006031 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006033 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6035 i++;
6036 if (j < i) {
6037 if (maxcount-- <= 0)
6038 break;
6039 SPLIT_APPEND(buf, j, i);
6040 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6041 i++;
6042 j = i;
6043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 }
6045 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
6048 return list;
6049
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 Py_DECREF(list);
6052 return NULL;
6053}
6054
6055PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006058 register Py_ssize_t i;
6059 register Py_ssize_t j;
6060 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 PyObject *list;
6062 PyObject *str;
6063 Py_UNICODE *data;
6064
6065 string = PyUnicode_FromObject(string);
6066 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 data = PyUnicode_AS_UNICODE(string);
6069 len = PyUnicode_GET_SIZE(string);
6070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 list = PyList_New(0);
6072 if (!list)
6073 goto onError;
6074
6075 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006077
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 /* Find a line and append it */
6079 while (i < len && !BLOOM_LINEBREAK(data[i]))
6080 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006083 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 if (i < len) {
6085 if (data[i] == '\r' && i + 1 < len &&
6086 data[i+1] == '\n')
6087 i += 2;
6088 else
6089 i++;
6090 if (keepends)
6091 eol = i;
6092 }
6093 SPLIT_APPEND(data, j, eol);
6094 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
6096 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
6099
6100 Py_DECREF(string);
6101 return list;
6102
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006104 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 Py_DECREF(string);
6106 return NULL;
6107}
6108
Tim Petersced69f82003-09-16 20:30:58 +00006109static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 PyObject *list,
6112 Py_UNICODE ch,
6113 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 register Py_ssize_t i;
6116 register Py_ssize_t j;
6117 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006119 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
6121 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 if (buf[i] == ch) {
6123 if (maxcount-- <= 0)
6124 break;
6125 SPLIT_APPEND(buf, j, i);
6126 i = j = i + 1;
6127 } else
6128 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 }
6130 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
6133 return list;
6134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 Py_DECREF(list);
6137 return NULL;
6138}
6139
Tim Petersced69f82003-09-16 20:30:58 +00006140static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 PyObject *list,
6143 PyUnicodeObject *substring,
6144 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006146 register Py_ssize_t i;
6147 register Py_ssize_t j;
6148 Py_ssize_t len = self->length;
6149 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 PyObject *str;
6151
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006152 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 if (Py_UNICODE_MATCH(self, i, substring)) {
6154 if (maxcount-- <= 0)
6155 break;
6156 SPLIT_APPEND(self->str, j, i);
6157 i = j = i + sublen;
6158 } else
6159 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
6161 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
6164 return list;
6165
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 Py_DECREF(list);
6168 return NULL;
6169}
6170
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006171static
6172PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 PyObject *list,
6174 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006175{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006176 register Py_ssize_t i;
6177 register Py_ssize_t j;
6178 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006179 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006180 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006181
6182 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006184 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006186 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6188 i--;
6189 if (j > i) {
6190 if (maxcount-- <= 0)
6191 break;
6192 SPLIT_APPEND(buf, i + 1, j + 1);
6193 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6194 i--;
6195 j = i;
6196 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006197 }
6198 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006200 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006201 if (PyList_Reverse(list) < 0)
6202 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006203 return list;
6204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006206 Py_DECREF(list);
6207 return NULL;
6208}
6209
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006211PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 PyObject *list,
6213 Py_UNICODE ch,
6214 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006216 register Py_ssize_t i;
6217 register Py_ssize_t j;
6218 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006219 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006220 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006221
6222 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 if (buf[i] == ch) {
6224 if (maxcount-- <= 0)
6225 break;
6226 SPLIT_APPEND(buf, i + 1, j + 1);
6227 j = i = i - 1;
6228 } else
6229 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006230 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006231 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006233 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006234 if (PyList_Reverse(list) < 0)
6235 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006236 return list;
6237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006239 Py_DECREF(list);
6240 return NULL;
6241}
6242
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006244PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 PyObject *list,
6246 PyUnicodeObject *substring,
6247 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006249 register Py_ssize_t i;
6250 register Py_ssize_t j;
6251 Py_ssize_t len = self->length;
6252 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006253 PyObject *str;
6254
6255 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 if (Py_UNICODE_MATCH(self, i, substring)) {
6257 if (maxcount-- <= 0)
6258 break;
6259 SPLIT_APPEND(self->str, i + sublen, j);
6260 j = i;
6261 i -= sublen;
6262 } else
6263 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006264 }
6265 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006267 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006268 if (PyList_Reverse(list) < 0)
6269 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006270 return list;
6271
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006273 Py_DECREF(list);
6274 return NULL;
6275}
6276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277#undef SPLIT_APPEND
6278
6279static
6280PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyUnicodeObject *substring,
6282 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283{
6284 PyObject *list;
6285
6286 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006287 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
6289 list = PyList_New(0);
6290 if (!list)
6291 return NULL;
6292
6293 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
6296 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
6299 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 Py_DECREF(list);
6301 PyErr_SetString(PyExc_ValueError, "empty separator");
6302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 }
6304 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Tim Petersced69f82003-09-16 20:30:58 +00006308static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006309PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyUnicodeObject *substring,
6311 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006312{
6313 PyObject *list;
6314
6315 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006316 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006317
6318 list = PyList_New(0);
6319 if (!list)
6320 return NULL;
6321
6322 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006324
6325 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006327
6328 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 Py_DECREF(list);
6330 PyErr_SetString(PyExc_ValueError, "empty separator");
6331 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006332 }
6333 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006335}
6336
6337static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 PyUnicodeObject *str1,
6340 PyUnicodeObject *str2,
6341 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
6343 PyUnicodeObject *u;
6344
6345 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
Thomas Wouters477c8d52006-05-27 19:21:47 +00006348 if (str1->length == str2->length) {
6349 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006350 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006351 if (str1->length == 1) {
6352 /* replace characters */
6353 Py_UNICODE u1, u2;
6354 if (!findchar(self->str, self->length, str1->str[0]))
6355 goto nothing;
6356 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6357 if (!u)
6358 return NULL;
6359 Py_UNICODE_COPY(u->str, self->str, self->length);
6360 u1 = str1->str[0];
6361 u2 = str2->str[0];
6362 for (i = 0; i < u->length; i++)
6363 if (u->str[i] == u1) {
6364 if (--maxcount < 0)
6365 break;
6366 u->str[i] = u2;
6367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 i = fastsearch(
6370 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006372 if (i < 0)
6373 goto nothing;
6374 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6375 if (!u)
6376 return NULL;
6377 Py_UNICODE_COPY(u->str, self->str, self->length);
6378 while (i <= self->length - str1->length)
6379 if (Py_UNICODE_MATCH(self, i, str1)) {
6380 if (--maxcount < 0)
6381 break;
6382 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6383 i += str1->length;
6384 } else
6385 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388
6389 Py_ssize_t n, i, j, e;
6390 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 Py_UNICODE *p;
6392
6393 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 if (n > maxcount)
6396 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006397 if (n == 0)
6398 goto nothing;
6399 /* new_size = self->length + n * (str2->length - str1->length)); */
6400 delta = (str2->length - str1->length);
6401 if (delta == 0) {
6402 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 product = n * (str2->length - str1->length);
6405 if ((product / (str2->length - str1->length)) != n) {
6406 PyErr_SetString(PyExc_OverflowError,
6407 "replace string is too long");
6408 return NULL;
6409 }
6410 new_size = self->length + product;
6411 if (new_size < 0) {
6412 PyErr_SetString(PyExc_OverflowError,
6413 "replace string is too long");
6414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 }
6416 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006417 u = _PyUnicode_New(new_size);
6418 if (!u)
6419 return NULL;
6420 i = 0;
6421 p = u->str;
6422 e = self->length - str1->length;
6423 if (str1->length > 0) {
6424 while (n-- > 0) {
6425 /* look for next match */
6426 j = i;
6427 while (j <= e) {
6428 if (Py_UNICODE_MATCH(self, j, str1))
6429 break;
6430 j++;
6431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006433 if (j > e)
6434 break;
6435 /* copy unchanged part [i:j] */
6436 Py_UNICODE_COPY(p, self->str+i, j-i);
6437 p += j - i;
6438 }
6439 /* copy substitution string */
6440 if (str2->length > 0) {
6441 Py_UNICODE_COPY(p, str2->str, str2->length);
6442 p += str2->length;
6443 }
6444 i = j + str1->length;
6445 }
6446 if (i < self->length)
6447 /* copy tail [i:] */
6448 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6449 } else {
6450 /* interleave */
6451 while (n > 0) {
6452 Py_UNICODE_COPY(p, str2->str, str2->length);
6453 p += str2->length;
6454 if (--n <= 0)
6455 break;
6456 *p++ = self->str[i++];
6457 }
6458 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006462
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464 /* nothing to replace; return original string (when possible) */
6465 if (PyUnicode_CheckExact(self)) {
6466 Py_INCREF(self);
6467 return (PyObject *) self;
6468 }
6469 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
6472/* --- Unicode Object Methods --------------------------------------------- */
6473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006474PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476\n\
6477Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
6480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006481unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 return fixup(self, fixtitle);
6484}
6485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488\n\
6489Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
6492static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006493unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 return fixup(self, fixcapitalize);
6496}
6497
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split on whitespace, capitalize each word via fixup(), then
   join the words again with the default separator of PyUnicode_Join(). */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006536/* Argument converter. Coerces to a single unicode character */
6537
6538static int
6539convert_uc(PyObject *obj, void *addr)
6540{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006541 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6542 PyObject *uniobj;
6543 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006544
Benjamin Peterson14339b62009-01-31 16:36:08 +00006545 uniobj = PyUnicode_FromObject(obj);
6546 if (uniobj == NULL) {
6547 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006549 return 0;
6550 }
6551 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6552 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006554 Py_DECREF(uniobj);
6555 return 0;
6556 }
6557 unistr = PyUnicode_AS_UNICODE(uniobj);
6558 *fillcharloc = unistr[0];
6559 Py_DECREF(uniobj);
6560 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006566Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006567done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569static PyObject *
6570unicode_center(PyUnicodeObject *self, PyObject *args)
6571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t marg, left;
6573 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006574 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
Thomas Woutersde017742006-02-16 19:34:37 +00006576 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 return NULL;
6578
Tim Peters7a29bd52001-09-12 03:03:31 +00006579 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 Py_INCREF(self);
6581 return (PyObject*) self;
6582 }
6583
6584 marg = width - self->length;
6585 left = marg / 2 + (marg & width & 1);
6586
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006587 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588}
6589
Marc-André Lemburge5034372000-08-08 08:04:29 +00006590#if 0
6591
6592/* This code should go into some future Unicode collation support
6593 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006594 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006595
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006596/* speedy UTF-16 code point order comparison */
6597/* gleaned from: */
6598/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6599
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006600static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006601{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006602 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006603 0, 0, 0, 0, 0, 0, 0, 0,
6604 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006605 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006606};
6607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608static int
6609unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6610{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006611 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 Py_UNICODE *s1 = str1->str;
6614 Py_UNICODE *s2 = str2->str;
6615
6616 len1 = str1->length;
6617 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006620 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006621
6622 c1 = *s1++;
6623 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006624
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 if (c1 > (1<<11) * 26)
6626 c1 += utf16Fixup[c1>>11];
6627 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006628 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006629 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006630
6631 if (c1 != c2)
6632 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006633
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006634 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
6636
6637 return (len1 < len2) ? -1 : (len1 != len2);
6638}
6639
Marc-André Lemburge5034372000-08-08 08:04:29 +00006640#else
6641
6642static int
6643unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6644{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006646
6647 Py_UNICODE *s1 = str1->str;
6648 Py_UNICODE *s2 = str2->str;
6649
6650 len1 = str1->length;
6651 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006652
Marc-André Lemburge5034372000-08-08 08:04:29 +00006653 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006654 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006655
Fredrik Lundh45714e92001-06-26 16:39:36 +00006656 c1 = *s1++;
6657 c2 = *s2++;
6658
6659 if (c1 != c2)
6660 return (c1 < c2) ? -1 : 1;
6661
Marc-André Lemburge5034372000-08-08 08:04:29 +00006662 len1--; len2--;
6663 }
6664
6665 return (len1 < len2) ? -1 : (len1 != len2);
6666}
6667
6668#endif
6669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006673 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6674 return unicode_compare((PyUnicodeObject *)left,
6675 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006676 PyErr_Format(PyExc_TypeError,
6677 "Can't compare %.100s and %.100s",
6678 left->ob_type->tp_name,
6679 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return -1;
6681}
6682
Martin v. Löwis5b222132007-06-10 09:51:05 +00006683int
6684PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6685{
6686 int i;
6687 Py_UNICODE *id;
6688 assert(PyUnicode_Check(uni));
6689 id = PyUnicode_AS_UNICODE(uni);
6690 /* Compare Unicode string and source character set string */
6691 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 if (id[i] != str[i])
6693 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006694 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006696 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006698 return 0;
6699}
6700
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006701
Benjamin Peterson29060642009-01-31 22:14:21 +00006702#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006704
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006705PyObject *PyUnicode_RichCompare(PyObject *left,
6706 PyObject *right,
6707 int op)
6708{
6709 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006710
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006711 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6712 PyObject *v;
6713 if (((PyUnicodeObject *) left)->length !=
6714 ((PyUnicodeObject *) right)->length) {
6715 if (op == Py_EQ) {
6716 Py_INCREF(Py_False);
6717 return Py_False;
6718 }
6719 if (op == Py_NE) {
6720 Py_INCREF(Py_True);
6721 return Py_True;
6722 }
6723 }
6724 if (left == right)
6725 result = 0;
6726 else
6727 result = unicode_compare((PyUnicodeObject *)left,
6728 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006729
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006730 /* Convert the return value to a Boolean */
6731 switch (op) {
6732 case Py_EQ:
6733 v = TEST_COND(result == 0);
6734 break;
6735 case Py_NE:
6736 v = TEST_COND(result != 0);
6737 break;
6738 case Py_LE:
6739 v = TEST_COND(result <= 0);
6740 break;
6741 case Py_GE:
6742 v = TEST_COND(result >= 0);
6743 break;
6744 case Py_LT:
6745 v = TEST_COND(result == -1);
6746 break;
6747 case Py_GT:
6748 v = TEST_COND(result == 1);
6749 break;
6750 default:
6751 PyErr_BadArgument();
6752 return NULL;
6753 }
6754 Py_INCREF(v);
6755 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006756 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006758 Py_INCREF(Py_NotImplemented);
6759 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006760}
6761
Guido van Rossum403d68b2000-03-13 15:55:09 +00006762int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006764{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006765 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006767
6768 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006769 sub = PyUnicode_FromObject(element);
6770 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 PyErr_Format(PyExc_TypeError,
6772 "'in <string>' requires string as left operand, not %s",
6773 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006774 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006775 }
6776
Thomas Wouters477c8d52006-05-27 19:21:47 +00006777 str = PyUnicode_FromObject(container);
6778 if (!str) {
6779 Py_DECREF(sub);
6780 return -1;
6781 }
6782
6783 result = stringlib_contains_obj(str, sub);
6784
6785 Py_DECREF(str);
6786 Py_DECREF(sub);
6787
Guido van Rossum403d68b2000-03-13 15:55:09 +00006788 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006789}
6790
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791/* Concat to string or Unicode object giving a new Unicode object. */
6792
6793PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795{
6796 PyUnicodeObject *u = NULL, *v = NULL, *w;
6797
6798 /* Coerce the two arguments */
6799 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6800 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6803 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806 /* Shortcuts */
6807 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 Py_DECREF(v);
6809 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 }
6811 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 Py_DECREF(u);
6813 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 }
6815
6816 /* Concat the two Unicode strings */
6817 w = _PyUnicode_New(u->length + v->length);
6818 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 Py_UNICODE_COPY(w->str, u->str, u->length);
6821 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6822
6823 Py_DECREF(u);
6824 Py_DECREF(v);
6825 return (PyObject *)w;
6826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 Py_XDECREF(u);
6829 Py_XDECREF(v);
6830 return NULL;
6831}
6832
Walter Dörwald1ab83302007-05-18 17:15:44 +00006833void
6834PyUnicode_Append(PyObject **pleft, PyObject *right)
6835{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836 PyObject *new;
6837 if (*pleft == NULL)
6838 return;
6839 if (right == NULL || !PyUnicode_Check(*pleft)) {
6840 Py_DECREF(*pleft);
6841 *pleft = NULL;
6842 return;
6843 }
6844 new = PyUnicode_Concat(*pleft, right);
6845 Py_DECREF(*pleft);
6846 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006847}
6848
6849void
6850PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6851{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852 PyUnicode_Append(pleft, right);
6853 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006854}
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006859Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006860string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject *
6864unicode_count(PyUnicodeObject *self, PyObject *args)
6865{
6866 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006867 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006868 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 PyObject *result;
6870
Guido van Rossumb8872e62000-05-09 14:14:27 +00006871 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 return NULL;
6874
6875 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006876 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006879
Thomas Wouters477c8d52006-05-27 19:21:47 +00006880 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
Christian Heimes217cfd12007-12-02 14:31:20 +00006882 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006883 stringlib_count(self->str + start, end - start,
6884 substring->str, substring->length)
6885 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
6887 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 return result;
6890}
6891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006895Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006896to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006897handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6899'xmlcharrefreplace' as well as any other name registered with\n\
6900codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902static PyObject *
6903unicode_encode(PyUnicodeObject *self, PyObject *args)
6904{
6905 char *encoding = NULL;
6906 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006907 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6910 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006911 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006912 if (v == NULL)
6913 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006914 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006915 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006916 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006917 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006918 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006919 Py_DECREF(v);
6920 return NULL;
6921 }
6922 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006923
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006925 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006926}
6927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930\n\
6931Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006932If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
6934static PyObject*
6935unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6936{
6937 Py_UNICODE *e;
6938 Py_UNICODE *p;
6939 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006940 Py_UNICODE *qe;
6941 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 PyUnicodeObject *u;
6943 int tabsize = 8;
6944
6945 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
Thomas Wouters7e474022000-07-16 12:04:32 +00006948 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006949 i = 0; /* chars up to and including most recent \n or \r */
6950 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6951 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 for (p = self->str; p < e; p++)
6953 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 if (tabsize > 0) {
6955 incr = tabsize - (j % tabsize); /* cannot overflow */
6956 if (j > PY_SSIZE_T_MAX - incr)
6957 goto overflow1;
6958 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 if (j > PY_SSIZE_T_MAX - 1)
6963 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 j++;
6965 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 if (i > PY_SSIZE_T_MAX - j)
6967 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006969 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 }
6971 }
6972
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006973 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 /* Second pass: create output string and fill it */
6977 u = _PyUnicode_New(i + j);
6978 if (!u)
6979 return NULL;
6980
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006981 j = 0; /* same as in first pass */
6982 q = u->str; /* next output char */
6983 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
6985 for (p = self->str; p < e; p++)
6986 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 if (tabsize > 0) {
6988 i = tabsize - (j % tabsize);
6989 j += i;
6990 while (i--) {
6991 if (q >= qe)
6992 goto overflow2;
6993 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006996 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 else {
6998 if (q >= qe)
6999 goto overflow2;
7000 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007001 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 if (*p == '\n' || *p == '\r')
7003 j = 0;
7004 }
7005
7006 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007007
7008 overflow2:
7009 Py_DECREF(u);
7010 overflow1:
7011 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007015PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017\n\
7018Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007019such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020arguments start and end are interpreted as in slice notation.\n\
7021\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007022Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
7024static PyObject *
7025unicode_find(PyUnicodeObject *self, PyObject *args)
7026{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007027 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007028 Py_ssize_t start;
7029 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007030 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
Christian Heimes9cd17752007-11-18 19:35:23 +00007032 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
Thomas Wouters477c8d52006-05-27 19:21:47 +00007035 result = stringlib_find_slice(
7036 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7037 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7038 start, end
7039 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042
Christian Heimes217cfd12007-12-02 14:31:20 +00007043 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044}
7045
7046static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007047unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048{
7049 if (index < 0 || index >= self->length) {
7050 PyErr_SetString(PyExc_IndexError, "string index out of range");
7051 return NULL;
7052 }
7053
7054 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7055}
7056
Guido van Rossumc2504932007-09-18 19:42:40 +00007057/* Believe it or not, this produces the same value for ASCII strings
7058 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007060unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
Guido van Rossumc2504932007-09-18 19:42:40 +00007062 Py_ssize_t len;
7063 Py_UNICODE *p;
7064 long x;
7065
7066 if (self->hash != -1)
7067 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007068 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007069 p = self->str;
7070 x = *p << 7;
7071 while (--len >= 0)
7072 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007073 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007074 if (x == -1)
7075 x = -2;
7076 self->hash = x;
7077 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078}
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
7085static PyObject *
7086unicode_index(PyUnicodeObject *self, PyObject *args)
7087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007089 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007090 Py_ssize_t start;
7091 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
Christian Heimes9cd17752007-11-18 19:35:23 +00007093 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096 result = stringlib_find_slice(
7097 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7098 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7099 start, end
7100 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101
7102 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 if (result < 0) {
7105 PyErr_SetString(PyExc_ValueError, "substring not found");
7106 return NULL;
7107 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007108
Christian Heimes217cfd12007-12-02 14:31:20 +00007109 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007115Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
7121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7122 register const Py_UNICODE *e;
7123 int cased;
7124
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 /* Shortcut for single character strings */
7126 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007129 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007130 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007132
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 e = p + PyUnicode_GET_SIZE(self);
7134 cased = 0;
7135 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007137
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7139 return PyBool_FromLong(0);
7140 else if (!cased && Py_UNICODE_ISLOWER(ch))
7141 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007143 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144}
7145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007146PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007149Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
7152static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007153unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154{
7155 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7156 register const Py_UNICODE *e;
7157 int cased;
7158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 /* Shortcut for single character strings */
7160 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007163 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007164 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 e = p + PyUnicode_GET_SIZE(self);
7168 cased = 0;
7169 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007171
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7173 return PyBool_FromLong(0);
7174 else if (!cased && Py_UNICODE_ISUPPER(ch))
7175 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007183Return True if S is a titlecased string and there is at least one\n\
7184character in S, i.e. upper- and titlecase characters may only\n\
7185follow uncased characters and lowercase characters only cased ones.\n\
7186Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187
7188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007189unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190{
7191 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7192 register const Py_UNICODE *e;
7193 int cased, previous_is_cased;
7194
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 /* Shortcut for single character strings */
7196 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7198 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007200 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007201 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007203
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 e = p + PyUnicode_GET_SIZE(self);
7205 cased = 0;
7206 previous_is_cased = 0;
7207 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007209
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7211 if (previous_is_cased)
7212 return PyBool_FromLong(0);
7213 previous_is_cased = 1;
7214 cased = 1;
7215 }
7216 else if (Py_UNICODE_ISLOWER(ch)) {
7217 if (!previous_is_cased)
7218 return PyBool_FromLong(0);
7219 previous_is_cased = 1;
7220 cased = 1;
7221 }
7222 else
7223 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007225 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007228PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007231Return True if all characters in S are whitespace\n\
7232and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233
7234static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007235unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236{
7237 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7238 register const Py_UNICODE *e;
7239
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 /* Shortcut for single character strings */
7241 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 Py_UNICODE_ISSPACE(*p))
7243 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007245 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007246 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007248
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 e = p + PyUnicode_GET_SIZE(self);
7250 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 if (!Py_UNICODE_ISSPACE(*p))
7252 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007254 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255}
7256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007257PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007260Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007262
7263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007264unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007265{
7266 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7267 register const Py_UNICODE *e;
7268
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007269 /* Shortcut for single character strings */
7270 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 Py_UNICODE_ISALPHA(*p))
7272 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007273
7274 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007275 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007277
7278 e = p + PyUnicode_GET_SIZE(self);
7279 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 if (!Py_UNICODE_ISALPHA(*p))
7281 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007282 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007283 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007284}
7285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007286PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007288\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007289Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007291
7292static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007293unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007294{
7295 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7296 register const Py_UNICODE *e;
7297
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007298 /* Shortcut for single character strings */
7299 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 Py_UNICODE_ISALNUM(*p))
7301 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007302
7303 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007304 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007306
7307 e = p + PyUnicode_GET_SIZE(self);
7308 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 if (!Py_UNICODE_ISALNUM(*p))
7310 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007311 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007312 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007313}
7314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007315PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007318Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
7321static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007322unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323{
7324 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7325 register const Py_UNICODE *e;
7326
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 /* Shortcut for single character strings */
7328 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 Py_UNICODE_ISDECIMAL(*p))
7330 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007332 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007333 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007335
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 e = p + PyUnicode_GET_SIZE(self);
7337 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 if (!Py_UNICODE_ISDECIMAL(*p))
7339 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342}
7343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007344PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007347Return True if all characters in S are digits\n\
7348and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
7350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007351unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352{
7353 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7354 register const Py_UNICODE *e;
7355
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 /* Shortcut for single character strings */
7357 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 Py_UNICODE_ISDIGIT(*p))
7359 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007361 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007362 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 e = p + PyUnicode_GET_SIZE(self);
7366 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 if (!Py_UNICODE_ISDIGIT(*p))
7368 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007370 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007376Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007377False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
7379static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007380unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
7382 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7383 register const Py_UNICODE *e;
7384
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 /* Shortcut for single character strings */
7386 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 Py_UNICODE_ISNUMERIC(*p))
7388 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007390 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007391 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007393
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 e = p + PyUnicode_GET_SIZE(self);
7395 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 if (!Py_UNICODE_ISNUMERIC(*p))
7397 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007399 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Martin v. Löwis47383402007-08-15 07:32:56 +00007402int
7403PyUnicode_IsIdentifier(PyObject *self)
7404{
7405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7406 register const Py_UNICODE *e;
7407
7408 /* Special case for empty strings */
7409 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007411
7412 /* PEP 3131 says that the first character must be in
7413 XID_Start and subsequent characters in XID_Continue,
7414 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007415 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007416 letters, digits, underscore). However, given the current
7417 definition of XID_Start and XID_Continue, it is sufficient
7418 to check just for these, except that _ must be allowed
7419 as starting an identifier. */
7420 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7421 return 0;
7422
7423 e = p + PyUnicode_GET_SIZE(self);
7424 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 if (!_PyUnicode_IsXidContinue(*p))
7426 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007427 }
7428 return 1;
7429}
7430
7431PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007433\n\
7434Return True if S is a valid identifier according\n\
7435to the language definition.");
7436
7437static PyObject*
7438unicode_isidentifier(PyObject *self)
7439{
7440 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7441}
7442
Georg Brandl559e5d72008-06-11 18:37:52 +00007443PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007445\n\
7446Return True if all characters in S are considered\n\
7447printable in repr() or S is empty, False otherwise.");
7448
7449static PyObject*
7450unicode_isprintable(PyObject *self)
7451{
7452 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7453 register const Py_UNICODE *e;
7454
7455 /* Shortcut for single character strings */
7456 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7457 Py_RETURN_TRUE;
7458 }
7459
7460 e = p + PyUnicode_GET_SIZE(self);
7461 for (; p < e; p++) {
7462 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7463 Py_RETURN_FALSE;
7464 }
7465 }
7466 Py_RETURN_TRUE;
7467}
7468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007469PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471\n\
7472Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
7475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007476unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007478 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482unicode_length(PyUnicodeObject *self)
7483{
7484 return self->length;
7485}
7486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007487PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007490Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007491done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
7493static PyObject *
7494unicode_ljust(PyUnicodeObject *self, PyObject *args)
7495{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007496 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007497 Py_UNICODE fillchar = ' ';
7498
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007499 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 return NULL;
7501
Tim Peters7a29bd52001-09-12 03:03:31 +00007502 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 Py_INCREF(self);
7504 return (PyObject*) self;
7505 }
7506
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007507 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508}
7509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007513Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007516unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 return fixup(self, fixlower);
7519}
7520
/* Selector values telling the strip helpers which end(s) of the string
   to trim; they also serve as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for error messages: skips the three-character
   "|O:" prefix of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7529
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007530/* externally visible for str.strip(unicode) */
7531PyObject *
7532_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7533{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007534 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7535 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7536 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7537 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7538 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007539
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007541
Benjamin Peterson14339b62009-01-31 16:36:08 +00007542 i = 0;
7543 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7545 i++;
7546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007547 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007548
Benjamin Peterson14339b62009-01-31 16:36:08 +00007549 j = len;
7550 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 do {
7552 j--;
7553 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7554 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007555 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007556
Benjamin Peterson14339b62009-01-31 16:36:08 +00007557 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 Py_INCREF(self);
7559 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007560 }
7561 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007563}
7564
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
7566static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007567do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007569 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7570 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007571
Benjamin Peterson14339b62009-01-31 16:36:08 +00007572 i = 0;
7573 if (striptype != RIGHTSTRIP) {
7574 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7575 i++;
7576 }
7577 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007578
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 j = len;
7580 if (striptype != LEFTSTRIP) {
7581 do {
7582 j--;
7583 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7584 j++;
7585 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007586
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7588 Py_INCREF(self);
7589 return (PyObject*)self;
7590 }
7591 else
7592 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007595
7596static PyObject *
7597do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7598{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007599 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007600
Benjamin Peterson14339b62009-01-31 16:36:08 +00007601 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7602 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007603
Benjamin Peterson14339b62009-01-31 16:36:08 +00007604 if (sep != NULL && sep != Py_None) {
7605 if (PyUnicode_Check(sep))
7606 return _PyUnicode_XStrip(self, striptype, sep);
7607 else {
7608 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 "%s arg must be None or str",
7610 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 return NULL;
7612 }
7613 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007614
Benjamin Peterson14339b62009-01-31 16:36:08 +00007615 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007616}
7617
7618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007621\n\
7622Return a copy of the string S with leading and trailing\n\
7623whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007624If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007625
7626static PyObject *
7627unicode_strip(PyUnicodeObject *self, PyObject *args)
7628{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 if (PyTuple_GET_SIZE(args) == 0)
7630 return do_strip(self, BOTHSTRIP); /* Common case */
7631 else
7632 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007633}
7634
7635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007638\n\
7639Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007640If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007641
7642static PyObject *
7643unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7644{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007645 if (PyTuple_GET_SIZE(args) == 0)
7646 return do_strip(self, LEFTSTRIP); /* Common case */
7647 else
7648 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007649}
7650
7651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007654\n\
7655Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007656If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007657
7658static PyObject *
7659unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007661 if (PyTuple_GET_SIZE(args) == 0)
7662 return do_strip(self, RIGHTSTRIP); /* Common case */
7663 else
7664 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665}
7666
7667
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670{
7671 PyUnicodeObject *u;
7672 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007674 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676 if (len < 0)
7677 len = 0;
7678
Tim Peters7a29bd52001-09-12 03:03:31 +00007679 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 /* no repeat, return original string */
7681 Py_INCREF(str);
7682 return (PyObject*) str;
7683 }
Tim Peters8f422462000-09-09 06:13:41 +00007684
7685 /* ensure # of chars needed doesn't overflow int and # of bytes
7686 * needed doesn't overflow size_t
7687 */
7688 nchars = len * str->length;
7689 if (len && nchars / len != str->length) {
7690 PyErr_SetString(PyExc_OverflowError,
7691 "repeated string is too long");
7692 return NULL;
7693 }
7694 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7695 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7696 PyErr_SetString(PyExc_OverflowError,
7697 "repeated string is too long");
7698 return NULL;
7699 }
7700 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 if (!u)
7702 return NULL;
7703
7704 p = u->str;
7705
Thomas Wouters477c8d52006-05-27 19:21:47 +00007706 if (str->length == 1 && len > 0) {
7707 Py_UNICODE_FILL(p, str->str[0], len);
7708 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 Py_ssize_t done = 0; /* number of characters copied this far */
7710 if (done < nchars) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007711 Py_UNICODE_COPY(p, str->str, str->length);
7712 done = str->length;
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 }
7714 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007715 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007716 Py_UNICODE_COPY(p+done, p, n);
7717 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 }
7720
7721 return (PyObject*) u;
7722}
7723
7724PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 PyObject *subobj,
7726 PyObject *replobj,
7727 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728{
7729 PyObject *self;
7730 PyObject *str1;
7731 PyObject *str2;
7732 PyObject *result;
7733
7734 self = PyUnicode_FromObject(obj);
7735 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 str1 = PyUnicode_FromObject(subobj);
7738 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 Py_DECREF(self);
7740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 }
7742 str2 = PyUnicode_FromObject(replobj);
7743 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 Py_DECREF(self);
7745 Py_DECREF(str1);
7746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 }
Tim Petersced69f82003-09-16 20:30:58 +00007748 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 (PyUnicodeObject *)str1,
7750 (PyUnicodeObject *)str2,
7751 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 Py_DECREF(self);
7753 Py_DECREF(str1);
7754 Py_DECREF(str2);
7755 return result;
7756}
7757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760\n\
7761Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007762old replaced by new. If the optional argument count is\n\
7763given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
7765static PyObject*
7766unicode_replace(PyUnicodeObject *self, PyObject *args)
7767{
7768 PyUnicodeObject *str1;
7769 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007770 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 PyObject *result;
7772
Martin v. Löwis18e16552006-02-15 17:27:45 +00007773 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 return NULL;
7775 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7776 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007779 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 Py_DECREF(str1);
7781 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783
7784 result = replace(self, str1, str2, maxcount);
7785
7786 Py_DECREF(str1);
7787 Py_DECREF(str2);
7788 return result;
7789}
7790
7791static
7792PyObject *unicode_repr(PyObject *unicode)
7793{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007794 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007795 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007796 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7797 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7798
7799 /* XXX(nnorwitz): rather than over-allocating, it would be
7800 better to choose a different scheme. Perhaps scan the
7801 first N-chars of the string and allocate based on that size.
7802 */
7803 /* Initial allocation is based on the longest-possible unichr
7804 escape.
7805
7806 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7807 unichr, so in this case it's the longest unichr escape. In
7808 narrow (UTF-16) builds this is five chars per source unichr
7809 since there are two unichrs in the surrogate pair, so in narrow
7810 (UTF-16) builds it's not the longest unichr escape.
7811
7812 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7813 so in the narrow (UTF-16) build case it's the longest unichr
7814 escape.
7815 */
7816
Walter Dörwald1ab83302007-05-18 17:15:44 +00007817 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007819#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007821#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007823#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007825 if (repr == NULL)
7826 return NULL;
7827
Walter Dörwald1ab83302007-05-18 17:15:44 +00007828 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007829
7830 /* Add quote */
7831 *p++ = (findchar(s, size, '\'') &&
7832 !findchar(s, size, '"')) ? '"' : '\'';
7833 while (size-- > 0) {
7834 Py_UNICODE ch = *s++;
7835
7836 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007837 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007838 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007839 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007840 continue;
7841 }
7842
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007844 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007845 *p++ = '\\';
7846 *p++ = 't';
7847 }
7848 else if (ch == '\n') {
7849 *p++ = '\\';
7850 *p++ = 'n';
7851 }
7852 else if (ch == '\r') {
7853 *p++ = '\\';
7854 *p++ = 'r';
7855 }
7856
7857 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007858 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007859 *p++ = '\\';
7860 *p++ = 'x';
7861 *p++ = hexdigits[(ch >> 4) & 0x000F];
7862 *p++ = hexdigits[ch & 0x000F];
7863 }
7864
Georg Brandl559e5d72008-06-11 18:37:52 +00007865 /* Copy ASCII characters as-is */
7866 else if (ch < 0x7F) {
7867 *p++ = ch;
7868 }
7869
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007871 else {
7872 Py_UCS4 ucs = ch;
7873
7874#ifndef Py_UNICODE_WIDE
7875 Py_UNICODE ch2 = 0;
7876 /* Get code point from surrogate pair */
7877 if (size > 0) {
7878 ch2 = *s;
7879 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007884 size--;
7885 }
7886 }
7887#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007889 (categories Z* and C* except ASCII space)
7890 */
7891 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7892 /* Map 8-bit characters to '\xhh' */
7893 if (ucs <= 0xff) {
7894 *p++ = '\\';
7895 *p++ = 'x';
7896 *p++ = hexdigits[(ch >> 4) & 0x000F];
7897 *p++ = hexdigits[ch & 0x000F];
7898 }
7899 /* Map 21-bit characters to '\U00xxxxxx' */
7900 else if (ucs >= 0x10000) {
7901 *p++ = '\\';
7902 *p++ = 'U';
7903 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7904 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7905 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7906 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7907 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7908 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7909 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7910 *p++ = hexdigits[ucs & 0x0000000F];
7911 }
7912 /* Map 16-bit characters to '\uxxxx' */
7913 else {
7914 *p++ = '\\';
7915 *p++ = 'u';
7916 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7917 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7918 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7919 *p++ = hexdigits[ucs & 0x000F];
7920 }
7921 }
7922 /* Copy characters as-is */
7923 else {
7924 *p++ = ch;
7925#ifndef Py_UNICODE_WIDE
7926 if (ucs >= 0x10000)
7927 *p++ = ch2;
7928#endif
7929 }
7930 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007931 }
7932 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007933 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007934
7935 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007936 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007937 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938}
7939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007940PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942\n\
7943Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007944such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945arguments start and end are interpreted as in slice notation.\n\
7946\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007947Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948
7949static PyObject *
7950unicode_rfind(PyUnicodeObject *self, PyObject *args)
7951{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007953 Py_ssize_t start;
7954 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007955 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956
Christian Heimes9cd17752007-11-18 19:35:23 +00007957 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00007958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960 result = stringlib_rfind_slice(
7961 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7962 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7963 start, end
7964 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965
7966 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007967
Christian Heimes217cfd12007-12-02 14:31:20 +00007968 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969}
7970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007971PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007974Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
7976static PyObject *
7977unicode_rindex(PyUnicodeObject *self, PyObject *args)
7978{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007979 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007980 Py_ssize_t start;
7981 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007982 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983
Christian Heimes9cd17752007-11-18 19:35:23 +00007984 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00007985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986
Thomas Wouters477c8d52006-05-27 19:21:47 +00007987 result = stringlib_rfind_slice(
7988 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7989 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7990 start, end
7991 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992
7993 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 if (result < 0) {
7996 PyErr_SetString(PyExc_ValueError, "substring not found");
7997 return NULL;
7998 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007999 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000}
8001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008002PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008005Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008006done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007
8008static PyObject *
8009unicode_rjust(PyUnicodeObject *self, PyObject *args)
8010{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008011 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008012 Py_UNICODE fillchar = ' ';
8013
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008014 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 return NULL;
8016
Tim Peters7a29bd52001-09-12 03:03:31 +00008017 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 Py_INCREF(self);
8019 return (PyObject*) self;
8020 }
8021
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008022 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023}
8024
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 PyObject *sep,
8027 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028{
8029 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 s = PyUnicode_FromObject(s);
8032 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008033 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 if (sep != NULL) {
8035 sep = PyUnicode_FromObject(sep);
8036 if (sep == NULL) {
8037 Py_DECREF(s);
8038 return NULL;
8039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
8041
8042 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8043
8044 Py_DECREF(s);
8045 Py_XDECREF(sep);
8046 return result;
8047}
8048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008049PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051\n\
8052Return a list of the words in S, using sep as the\n\
8053delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008054splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008055whitespace string is a separator and empty strings are\n\
8056removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
8058static PyObject*
8059unicode_split(PyUnicodeObject *self, PyObject *args)
8060{
8061 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
Martin v. Löwis18e16552006-02-15 17:27:45 +00008064 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 return NULL;
8066
8067 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
8074
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075PyObject *
8076PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8077{
8078 PyObject* str_obj;
8079 PyObject* sep_obj;
8080 PyObject* out;
8081
8082 str_obj = PyUnicode_FromObject(str_in);
8083 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085 sep_obj = PyUnicode_FromObject(sep_in);
8086 if (!sep_obj) {
8087 Py_DECREF(str_obj);
8088 return NULL;
8089 }
8090
8091 out = stringlib_partition(
8092 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8093 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8094 );
8095
8096 Py_DECREF(sep_obj);
8097 Py_DECREF(str_obj);
8098
8099 return out;
8100}
8101
8102
8103PyObject *
8104PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8105{
8106 PyObject* str_obj;
8107 PyObject* sep_obj;
8108 PyObject* out;
8109
8110 str_obj = PyUnicode_FromObject(str_in);
8111 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008113 sep_obj = PyUnicode_FromObject(sep_in);
8114 if (!sep_obj) {
8115 Py_DECREF(str_obj);
8116 return NULL;
8117 }
8118
8119 out = stringlib_rpartition(
8120 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8121 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8122 );
8123
8124 Py_DECREF(sep_obj);
8125 Py_DECREF(str_obj);
8126
8127 return out;
8128}
8129
8130PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008132\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008133Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008134the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008135found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008136
8137static PyObject*
8138unicode_partition(PyUnicodeObject *self, PyObject *separator)
8139{
8140 return PyUnicode_Partition((PyObject *)self, separator);
8141}
8142
8143PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008145\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008146Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008147the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008148separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008149
8150static PyObject*
8151unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8152{
8153 return PyUnicode_RPartition((PyObject *)self, separator);
8154}
8155
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008156PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 PyObject *sep,
8158 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008159{
8160 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008162 s = PyUnicode_FromObject(s);
8163 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 if (sep != NULL) {
8166 sep = PyUnicode_FromObject(sep);
8167 if (sep == NULL) {
8168 Py_DECREF(s);
8169 return NULL;
8170 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008171 }
8172
8173 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8174
8175 Py_DECREF(s);
8176 Py_XDECREF(sep);
8177 return result;
8178}
8179
8180PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008182\n\
8183Return a list of the words in S, using sep as the\n\
8184delimiter string, starting at the end of the string and\n\
8185working to the front. If maxsplit is given, at most maxsplit\n\
8186splits are done. If sep is not specified, any whitespace string\n\
8187is a separator.");
8188
8189static PyObject*
8190unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8191{
8192 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008193 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008194
Martin v. Löwis18e16552006-02-15 17:27:45 +00008195 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008196 return NULL;
8197
8198 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008200 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008202 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008204}
8205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008206PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208\n\
8209Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008210Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008211is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
8213static PyObject*
8214unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8215{
Guido van Rossum86662912000-04-11 15:38:46 +00008216 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217
Guido van Rossum86662912000-04-11 15:38:46 +00008218 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 return NULL;
8220
Guido van Rossum86662912000-04-11 15:38:46 +00008221 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222}
8223
8224static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008225PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226{
Walter Dörwald346737f2007-05-31 10:44:43 +00008227 if (PyUnicode_CheckExact(self)) {
8228 Py_INCREF(self);
8229 return self;
8230 } else
8231 /* Subtype -- return genuine unicode string with the same value. */
8232 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8233 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
8235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008236PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238\n\
8239Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008240and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241
8242static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008243unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 return fixup(self, fixswapcase);
8246}
8247
Georg Brandlceee0772007-11-27 23:48:05 +00008248PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008250\n\
8251Return a translation table usable for str.translate().\n\
8252If there is only one argument, it must be a dictionary mapping Unicode\n\
8253ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008254Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008255If there are two arguments, they must be strings of equal length, and\n\
8256in the resulting dictionary, each character in x will be mapped to the\n\
8257character at the same position in y. If there is a third argument, it\n\
8258must be a string, whose characters will be mapped to None in the result.");
8259
8260static PyObject*
8261unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8262{
8263 PyObject *x, *y = NULL, *z = NULL;
8264 PyObject *new = NULL, *key, *value;
8265 Py_ssize_t i = 0;
8266 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267
Georg Brandlceee0772007-11-27 23:48:05 +00008268 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8269 return NULL;
8270 new = PyDict_New();
8271 if (!new)
8272 return NULL;
8273 if (y != NULL) {
8274 /* x must be a string too, of equal length */
8275 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8276 if (!PyUnicode_Check(x)) {
8277 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8278 "be a string if there is a second argument");
8279 goto err;
8280 }
8281 if (PyUnicode_GET_SIZE(x) != ylen) {
8282 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8283 "arguments must have equal length");
8284 goto err;
8285 }
8286 /* create entries for translating chars in x to those in y */
8287 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008288 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8289 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008290 if (!key || !value)
8291 goto err;
8292 res = PyDict_SetItem(new, key, value);
8293 Py_DECREF(key);
8294 Py_DECREF(value);
8295 if (res < 0)
8296 goto err;
8297 }
8298 /* create entries for deleting chars in z */
8299 if (z != NULL) {
8300 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008301 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008302 if (!key)
8303 goto err;
8304 res = PyDict_SetItem(new, key, Py_None);
8305 Py_DECREF(key);
8306 if (res < 0)
8307 goto err;
8308 }
8309 }
8310 } else {
8311 /* x must be a dict */
8312 if (!PyDict_Check(x)) {
8313 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8314 "to maketrans it must be a dict");
8315 goto err;
8316 }
8317 /* copy entries into the new dict, converting string keys to int keys */
8318 while (PyDict_Next(x, &i, &key, &value)) {
8319 if (PyUnicode_Check(key)) {
8320 /* convert string keys to integer keys */
8321 PyObject *newkey;
8322 if (PyUnicode_GET_SIZE(key) != 1) {
8323 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8324 "table must be of length 1");
8325 goto err;
8326 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008327 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008328 if (!newkey)
8329 goto err;
8330 res = PyDict_SetItem(new, newkey, value);
8331 Py_DECREF(newkey);
8332 if (res < 0)
8333 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008334 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008335 /* just keep integer keys */
8336 if (PyDict_SetItem(new, key, value) < 0)
8337 goto err;
8338 } else {
8339 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8340 "be strings or integers");
8341 goto err;
8342 }
8343 }
8344 }
8345 return new;
8346 err:
8347 Py_DECREF(new);
8348 return NULL;
8349}
8350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008351PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353\n\
8354Return a copy of the string S, where all characters have been mapped\n\
8355through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008356Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008357Unmapped characters are left untouched. Characters mapped to None\n\
8358are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359
8360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008361unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
Georg Brandlceee0772007-11-27 23:48:05 +00008363 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364}
8365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008366PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008369Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370
8371static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008372unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 return fixup(self, fixupper);
8375}
8376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008377PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008380Pad a numeric string S with zeros on the left, to fill a field\n\
8381of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382
8383static PyObject *
8384unicode_zfill(PyUnicodeObject *self, PyObject *args)
8385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008386 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 PyUnicodeObject *u;
8388
Martin v. Löwis18e16552006-02-15 17:27:45 +00008389 Py_ssize_t width;
8390 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 return NULL;
8392
8393 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008394 if (PyUnicode_CheckExact(self)) {
8395 Py_INCREF(self);
8396 return (PyObject*) self;
8397 }
8398 else
8399 return PyUnicode_FromUnicode(
8400 PyUnicode_AS_UNICODE(self),
8401 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 }
8404
8405 fill = width - self->length;
8406
8407 u = pad(self, fill, 0, '0');
8408
Walter Dörwald068325e2002-04-15 13:36:47 +00008409 if (u == NULL)
8410 return NULL;
8411
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 if (u->str[fill] == '+' || u->str[fill] == '-') {
8413 /* move sign to beginning of string */
8414 u->str[0] = u->str[fill];
8415 u->str[fill] = '0';
8416 }
8417
8418 return (PyObject*) u;
8419}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420
#if 0
/* Compiled out: returns the value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008429PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008432Return True if S starts with the specified prefix, False otherwise.\n\
8433With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008434With optional end, stop comparing S at that position.\n\
8435prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436
8437static PyObject *
8438unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008441 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008443 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008444 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008445 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008447 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8449 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008450 if (PyTuple_Check(subobj)) {
8451 Py_ssize_t i;
8452 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8453 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008455 if (substring == NULL)
8456 return NULL;
8457 result = tailmatch(self, substring, start, end, -1);
8458 Py_DECREF(substring);
8459 if (result) {
8460 Py_RETURN_TRUE;
8461 }
8462 }
8463 /* nothing matched */
8464 Py_RETURN_FALSE;
8465 }
8466 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008469 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008471 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472}
8473
8474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008475PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008478Return True if S ends with the specified suffix, False otherwise.\n\
8479With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008480With optional end, stop comparing S at that position.\n\
8481suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482
8483static PyObject *
8484unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008487 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008489 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008490 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008491 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008493 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8495 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008496 if (PyTuple_Check(subobj)) {
8497 Py_ssize_t i;
8498 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8499 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008501 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008503 result = tailmatch(self, substring, start, end, +1);
8504 Py_DECREF(substring);
8505 if (result) {
8506 Py_RETURN_TRUE;
8507 }
8508 }
8509 Py_RETURN_FALSE;
8510 }
8511 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008515 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008517 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518}
8519
Eric Smith8c663262007-08-25 02:26:07 +00008520#include "stringlib/string_format.h"
8521
8522PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008524\n\
8525");
8526
Eric Smith4a7d76d2008-05-30 18:10:19 +00008527static PyObject *
8528unicode__format__(PyObject* self, PyObject* args)
8529{
8530 PyObject *format_spec;
8531
8532 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8533 return NULL;
8534
8535 return _PyUnicode_FormatAdvanced(self,
8536 PyUnicode_AS_UNICODE(format_spec),
8537 PyUnicode_GET_SIZE(format_spec));
8538}
8539
Eric Smith8c663262007-08-25 02:26:07 +00008540PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008542\n\
8543");
8544
8545static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008546unicode__sizeof__(PyUnicodeObject *v)
8547{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008548 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8549 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008550}
8551
8552PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008554
8555static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008556unicode_getnewargs(PyUnicodeObject *v)
8557{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008558 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008559}
8560
8561
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562static PyMethodDef unicode_methods[] = {
8563
8564 /* Order is according to common usage: often used methods should
8565 appear first, since lookup is done sequentially. */
8566
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008567 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8568 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8569 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008570 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008571 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8572 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8573 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8574 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8575 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8576 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8577 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008578 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008579 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8580 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8581 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008582 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008583 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8584 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8585 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008586 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008587 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008588 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008589 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008590 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8591 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8592 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8593 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8594 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8595 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8596 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8597 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8598 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8599 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8600 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8601 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8602 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8603 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008604 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008605 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008606 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008607 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008608 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008609 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8610 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008611 {"maketrans", (PyCFunction) unicode_maketrans,
8612 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008613 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008614#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008615 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616#endif
8617
8618#if 0
8619 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008620 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621#endif
8622
Benjamin Peterson14339b62009-01-31 16:36:08 +00008623 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 {NULL, NULL}
8625};
8626
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008627static PyObject *
8628unicode_mod(PyObject *v, PyObject *w)
8629{
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 if (!PyUnicode_Check(v)) {
8631 Py_INCREF(Py_NotImplemented);
8632 return Py_NotImplemented;
8633 }
8634 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008635}
8636
8637static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008638 0, /*nb_add*/
8639 0, /*nb_subtract*/
8640 0, /*nb_multiply*/
8641 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008642};
8643
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 (lenfunc) unicode_length, /* sq_length */
8646 PyUnicode_Concat, /* sq_concat */
8647 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8648 (ssizeargfunc) unicode_getitem, /* sq_item */
8649 0, /* sq_slice */
8650 0, /* sq_ass_item */
8651 0, /* sq_ass_slice */
8652 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653};
8654
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008655static PyObject*
8656unicode_subscript(PyUnicodeObject* self, PyObject* item)
8657{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008658 if (PyIndex_Check(item)) {
8659 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008660 if (i == -1 && PyErr_Occurred())
8661 return NULL;
8662 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008663 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008664 return unicode_getitem(self, i);
8665 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008667 Py_UNICODE* source_buf;
8668 Py_UNICODE* result_buf;
8669 PyObject* result;
8670
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008671 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008673 return NULL;
8674 }
8675
8676 if (slicelength <= 0) {
8677 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008678 } else if (start == 0 && step == 1 && slicelength == self->length &&
8679 PyUnicode_CheckExact(self)) {
8680 Py_INCREF(self);
8681 return (PyObject *)self;
8682 } else if (step == 1) {
8683 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008684 } else {
8685 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008686 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8687 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 if (result_buf == NULL)
8690 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008691
8692 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8693 result_buf[i] = source_buf[cur];
8694 }
Tim Petersced69f82003-09-16 20:30:58 +00008695
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008696 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008697 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008698 return result;
8699 }
8700 } else {
8701 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8702 return NULL;
8703 }
8704}
8705
8706static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008707 (lenfunc)unicode_length, /* mp_length */
8708 (binaryfunc)unicode_subscript, /* mp_subscript */
8709 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008710};
8711
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713/* Helpers for PyUnicode_Format() */
8714
8715static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008716getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008718 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 (*p_argidx)++;
8721 if (arglen < 0)
8722 return args;
8723 else
8724 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 }
8726 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 return NULL;
8729}
8730
Martin v. Löwis18e16552006-02-15 17:27:45 +00008731static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008732strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008734 register Py_ssize_t i;
8735 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 for (i = len - 1; i >= 0; i--)
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 return len;
8740}
8741
Neal Norwitzfc76d632006-01-10 06:03:13 +00008742static int
8743doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8744{
Tim Peters15231542006-02-16 01:08:01 +00008745 Py_ssize_t result;
8746
Neal Norwitzfc76d632006-01-10 06:03:13 +00008747 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008748 result = strtounicode(buffer, (char *)buffer);
8749 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008750}
8751
#if 0
/* Disabled: long counterpart of doubletounicode().  Formats x as bytes
   at the start of buffer with PyOS_snprintf(), widens the bytes in place
   with strtounicode(), and returns the number of characters. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008763
Guido van Rossum078151d2002-08-11 04:24:12 +00008764/* XXX To save some code duplication, formatfloat/long/int could have been
8765 shared with stringobject.c, converting from 8-bit to Unicode after the
8766 formatting is done. */
8767
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768static int
8769formatfloat(Py_UNICODE *buf,
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 size_t buflen,
8771 int flags,
8772 int prec,
8773 int type,
8774 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008776 /* fmt = '%#.' + `prec` + `type`
8777 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 char fmt[20];
8779 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008780
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 x = PyFloat_AsDouble(v);
8782 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008786 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008788 /* Worst case length calc to ensure no buffer overrun:
8789
8790 'g' formats:
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 fmt = %#.<prec>g
8792 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8793 for any double rep.)
8794 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008795
8796 'f' formats:
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8798 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008799
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008800 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008801 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008802
8803 */
Benjamin Peterson14339b62009-01-31 16:36:08 +00008804 if (((type == 'g' || type == 'G') &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 buflen <= (size_t)10 + (size_t)prec) ||
8806 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8807 PyErr_SetString(PyExc_OverflowError,
8808 "formatted float is too long (precision too large?)");
8809 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008810 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008811 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 (flags&F_ALT) ? "#" : "",
8813 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008814 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815}
8816
Tim Peters38fd5b62000-09-21 05:43:11 +00008817static PyObject*
8818formatlong(PyObject *val, int flags, int prec, int type)
8819{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008820 char *buf;
8821 int len;
8822 PyObject *str; /* temporary string object. */
8823 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008824
Benjamin Peterson14339b62009-01-31 16:36:08 +00008825 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8826 if (!str)
8827 return NULL;
8828 result = PyUnicode_FromStringAndSize(buf, len);
8829 Py_DECREF(str);
8830 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008831}
8832
#if 0
/* Disabled: format the integer object v through a C long into buf.
   Returns the number of characters written, or -1 with an exception
   set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign = "";
    long x;
    int based;          /* true for the 'x', 'X' and 'o' conversions */

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is printed as a signed decimal. */
    if (x < 0 && type == 'u')
        type = 'd';

    based = (type == 'x' || type == 'X' || type == 'o');

    /* For hex/octal the sign is emitted by hand and the magnitude is
       formatted (see the negation at the bottom). */
    if (x < 0 && based)
        sign = "-";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && based) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910
8911static int
8912formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008913 size_t buflen,
8914 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008916 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008917 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 if (PyUnicode_GET_SIZE(v) == 1) {
8919 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8920 buf[1] = '\0';
8921 return 1;
8922 }
8923#ifndef Py_UNICODE_WIDE
8924 if (PyUnicode_GET_SIZE(v) == 2) {
8925 /* Decode a valid surrogate pair */
8926 int c0 = PyUnicode_AS_UNICODE(v)[0];
8927 int c1 = PyUnicode_AS_UNICODE(v)[1];
8928 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8929 0xDC00 <= c1 && c1 <= 0xDFFF) {
8930 buf[0] = c0;
8931 buf[1] = c1;
8932 buf[2] = '\0';
8933 return 2;
8934 }
8935 }
8936#endif
8937 goto onError;
8938 }
8939 else {
8940 /* Integer input truncated to a character */
8941 long x;
8942 x = PyLong_AsLong(v);
8943 if (x == -1 && PyErr_Occurred())
8944 goto onError;
8945
8946 if (x < 0 || x > 0x10ffff) {
8947 PyErr_SetString(PyExc_OverflowError,
8948 "%c arg not in range(0x110000)");
8949 return -1;
8950 }
8951
8952#ifndef Py_UNICODE_WIDE
8953 if (x > 0xffff) {
8954 x -= 0x10000;
8955 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8956 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8957 return 2;
8958 }
8959#endif
8960 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008961 buf[1] = '\0';
8962 return 1;
8963 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008964
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008966 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008968 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969}
8970
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8980
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
8984 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008985 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 int args_owned = 0;
8987 PyUnicodeObject *result = NULL;
8988 PyObject *dict = NULL;
8989 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008990
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 PyErr_BadInternalCall();
8993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 }
8995 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008996 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 fmt = PyUnicode_AS_UNICODE(uformat);
8999 fmtcnt = PyUnicode_GET_SIZE(uformat);
9000
9001 reslen = rescnt = fmtcnt + 100;
9002 result = _PyUnicode_New(reslen);
9003 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 res = PyUnicode_AS_UNICODE(result);
9006
9007 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 arglen = PyTuple_Size(args);
9009 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 }
9011 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 arglen = -1;
9013 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009015 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009016 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018
9019 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 if (*fmt != '%') {
9021 if (--rescnt < 0) {
9022 rescnt = fmtcnt + 100;
9023 reslen += rescnt;
9024 if (_PyUnicode_Resize(&result, reslen) < 0)
9025 goto onError;
9026 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9027 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009028 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 }
9031 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 /* Got a format specifier */
9033 int flags = 0;
9034 Py_ssize_t width = -1;
9035 int prec = -1;
9036 Py_UNICODE c = '\0';
9037 Py_UNICODE fill;
9038 int isnumok;
9039 PyObject *v = NULL;
9040 PyObject *temp = NULL;
9041 Py_UNICODE *pbuf;
9042 Py_UNICODE sign;
9043 Py_ssize_t len;
9044 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 fmt++;
9047 if (*fmt == '(') {
9048 Py_UNICODE *keystart;
9049 Py_ssize_t keylen;
9050 PyObject *key;
9051 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009052
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 if (dict == NULL) {
9054 PyErr_SetString(PyExc_TypeError,
9055 "format requires a mapping");
9056 goto onError;
9057 }
9058 ++fmt;
9059 --fmtcnt;
9060 keystart = fmt;
9061 /* Skip over balanced parentheses */
9062 while (pcount > 0 && --fmtcnt >= 0) {
9063 if (*fmt == ')')
9064 --pcount;
9065 else if (*fmt == '(')
9066 ++pcount;
9067 fmt++;
9068 }
9069 keylen = fmt - keystart - 1;
9070 if (fmtcnt < 0 || pcount > 0) {
9071 PyErr_SetString(PyExc_ValueError,
9072 "incomplete format key");
9073 goto onError;
9074 }
9075#if 0
9076 /* keys are converted to strings using UTF-8 and
9077 then looked up since Python uses strings to hold
9078 variables names etc. in its namespaces and we
9079 wouldn't want to break common idioms. */
9080 key = PyUnicode_EncodeUTF8(keystart,
9081 keylen,
9082 NULL);
9083#else
9084 key = PyUnicode_FromUnicode(keystart, keylen);
9085#endif
9086 if (key == NULL)
9087 goto onError;
9088 if (args_owned) {
9089 Py_DECREF(args);
9090 args_owned = 0;
9091 }
9092 args = PyObject_GetItem(dict, key);
9093 Py_DECREF(key);
9094 if (args == NULL) {
9095 goto onError;
9096 }
9097 args_owned = 1;
9098 arglen = -1;
9099 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 while (--fmtcnt >= 0) {
9102 switch (c = *fmt++) {
9103 case '-': flags |= F_LJUST; continue;
9104 case '+': flags |= F_SIGN; continue;
9105 case ' ': flags |= F_BLANK; continue;
9106 case '#': flags |= F_ALT; continue;
9107 case '0': flags |= F_ZERO; continue;
9108 }
9109 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009110 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 if (c == '*') {
9112 v = getnextarg(args, arglen, &argidx);
9113 if (v == NULL)
9114 goto onError;
9115 if (!PyLong_Check(v)) {
9116 PyErr_SetString(PyExc_TypeError,
9117 "* wants int");
9118 goto onError;
9119 }
9120 width = PyLong_AsLong(v);
9121 if (width == -1 && PyErr_Occurred())
9122 goto onError;
9123 if (width < 0) {
9124 flags |= F_LJUST;
9125 width = -width;
9126 }
9127 if (--fmtcnt >= 0)
9128 c = *fmt++;
9129 }
9130 else if (c >= '0' && c <= '9') {
9131 width = c - '0';
9132 while (--fmtcnt >= 0) {
9133 c = *fmt++;
9134 if (c < '0' || c > '9')
9135 break;
9136 if ((width*10) / 10 != width) {
9137 PyErr_SetString(PyExc_ValueError,
9138 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009139 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 }
9141 width = width*10 + (c - '0');
9142 }
9143 }
9144 if (c == '.') {
9145 prec = 0;
9146 if (--fmtcnt >= 0)
9147 c = *fmt++;
9148 if (c == '*') {
9149 v = getnextarg(args, arglen, &argidx);
9150 if (v == NULL)
9151 goto onError;
9152 if (!PyLong_Check(v)) {
9153 PyErr_SetString(PyExc_TypeError,
9154 "* wants int");
9155 goto onError;
9156 }
9157 prec = PyLong_AsLong(v);
9158 if (prec == -1 && PyErr_Occurred())
9159 goto onError;
9160 if (prec < 0)
9161 prec = 0;
9162 if (--fmtcnt >= 0)
9163 c = *fmt++;
9164 }
9165 else if (c >= '0' && c <= '9') {
9166 prec = c - '0';
9167 while (--fmtcnt >= 0) {
9168 c = Py_CHARMASK(*fmt++);
9169 if (c < '0' || c > '9')
9170 break;
9171 if ((prec*10) / 10 != prec) {
9172 PyErr_SetString(PyExc_ValueError,
9173 "prec too big");
9174 goto onError;
9175 }
9176 prec = prec*10 + (c - '0');
9177 }
9178 }
9179 } /* prec */
9180 if (fmtcnt >= 0) {
9181 if (c == 'h' || c == 'l' || c == 'L') {
9182 if (--fmtcnt >= 0)
9183 c = *fmt++;
9184 }
9185 }
9186 if (fmtcnt < 0) {
9187 PyErr_SetString(PyExc_ValueError,
9188 "incomplete format");
9189 goto onError;
9190 }
9191 if (c != '%') {
9192 v = getnextarg(args, arglen, &argidx);
9193 if (v == NULL)
9194 goto onError;
9195 }
9196 sign = 0;
9197 fill = ' ';
9198 switch (c) {
9199
9200 case '%':
9201 pbuf = formatbuf;
9202 /* presume that buffer length is at least 1 */
9203 pbuf[0] = '%';
9204 len = 1;
9205 break;
9206
9207 case 's':
9208 case 'r':
9209 case 'a':
9210 if (PyUnicode_Check(v) && c == 's') {
9211 temp = v;
9212 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009213 }
9214 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 if (c == 's')
9216 temp = PyObject_Str(v);
9217 else if (c == 'r')
9218 temp = PyObject_Repr(v);
9219 else
9220 temp = PyObject_ASCII(v);
9221 if (temp == NULL)
9222 goto onError;
9223 if (PyUnicode_Check(temp))
9224 /* nothing to do */;
9225 else {
9226 Py_DECREF(temp);
9227 PyErr_SetString(PyExc_TypeError,
9228 "%s argument has non-string str()");
9229 goto onError;
9230 }
9231 }
9232 pbuf = PyUnicode_AS_UNICODE(temp);
9233 len = PyUnicode_GET_SIZE(temp);
9234 if (prec >= 0 && len > prec)
9235 len = prec;
9236 break;
9237
9238 case 'i':
9239 case 'd':
9240 case 'u':
9241 case 'o':
9242 case 'x':
9243 case 'X':
9244 if (c == 'i')
9245 c = 'd';
9246 isnumok = 0;
9247 if (PyNumber_Check(v)) {
9248 PyObject *iobj=NULL;
9249
9250 if (PyLong_Check(v)) {
9251 iobj = v;
9252 Py_INCREF(iobj);
9253 }
9254 else {
9255 iobj = PyNumber_Long(v);
9256 }
9257 if (iobj!=NULL) {
9258 if (PyLong_Check(iobj)) {
9259 isnumok = 1;
9260 temp = formatlong(iobj, flags, prec, c);
9261 Py_DECREF(iobj);
9262 if (!temp)
9263 goto onError;
9264 pbuf = PyUnicode_AS_UNICODE(temp);
9265 len = PyUnicode_GET_SIZE(temp);
9266 sign = 1;
9267 }
9268 else {
9269 Py_DECREF(iobj);
9270 }
9271 }
9272 }
9273 if (!isnumok) {
9274 PyErr_Format(PyExc_TypeError,
9275 "%%%c format: a number is required, "
9276 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9277 goto onError;
9278 }
9279 if (flags & F_ZERO)
9280 fill = '0';
9281 break;
9282
9283 case 'e':
9284 case 'E':
9285 case 'f':
9286 case 'F':
9287 case 'g':
9288 case 'G':
9289 if (c == 'F')
9290 c = 'f';
9291 pbuf = formatbuf;
9292 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9293 flags, prec, c, v);
9294 if (len < 0)
9295 goto onError;
9296 sign = 1;
9297 if (flags & F_ZERO)
9298 fill = '0';
9299 break;
9300
9301 case 'c':
9302 pbuf = formatbuf;
9303 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9304 if (len < 0)
9305 goto onError;
9306 break;
9307
9308 default:
9309 PyErr_Format(PyExc_ValueError,
9310 "unsupported format character '%c' (0x%x) "
9311 "at index %zd",
9312 (31<=c && c<=126) ? (char)c : '?',
9313 (int)c,
9314 (Py_ssize_t)(fmt - 1 -
9315 PyUnicode_AS_UNICODE(uformat)));
9316 goto onError;
9317 }
9318 if (sign) {
9319 if (*pbuf == '-' || *pbuf == '+') {
9320 sign = *pbuf++;
9321 len--;
9322 }
9323 else if (flags & F_SIGN)
9324 sign = '+';
9325 else if (flags & F_BLANK)
9326 sign = ' ';
9327 else
9328 sign = 0;
9329 }
9330 if (width < len)
9331 width = len;
9332 if (rescnt - (sign != 0) < width) {
9333 reslen -= rescnt;
9334 rescnt = width + fmtcnt + 100;
9335 reslen += rescnt;
9336 if (reslen < 0) {
9337 Py_XDECREF(temp);
9338 PyErr_NoMemory();
9339 goto onError;
9340 }
9341 if (_PyUnicode_Resize(&result, reslen) < 0) {
9342 Py_XDECREF(temp);
9343 goto onError;
9344 }
9345 res = PyUnicode_AS_UNICODE(result)
9346 + reslen - rescnt;
9347 }
9348 if (sign) {
9349 if (fill != ' ')
9350 *res++ = sign;
9351 rescnt--;
9352 if (width > len)
9353 width--;
9354 }
9355 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9356 assert(pbuf[0] == '0');
9357 assert(pbuf[1] == c);
9358 if (fill != ' ') {
9359 *res++ = *pbuf++;
9360 *res++ = *pbuf++;
9361 }
9362 rescnt -= 2;
9363 width -= 2;
9364 if (width < 0)
9365 width = 0;
9366 len -= 2;
9367 }
9368 if (width > len && !(flags & F_LJUST)) {
9369 do {
9370 --rescnt;
9371 *res++ = fill;
9372 } while (--width > len);
9373 }
9374 if (fill == ' ') {
9375 if (sign)
9376 *res++ = sign;
9377 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9378 assert(pbuf[0] == '0');
9379 assert(pbuf[1] == c);
9380 *res++ = *pbuf++;
9381 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009382 }
9383 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 Py_UNICODE_COPY(res, pbuf, len);
9385 res += len;
9386 rescnt -= len;
9387 while (--width >= len) {
9388 --rescnt;
9389 *res++ = ' ';
9390 }
9391 if (dict && (argidx < arglen) && c != '%') {
9392 PyErr_SetString(PyExc_TypeError,
9393 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009394 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 goto onError;
9396 }
9397 Py_XDECREF(temp);
9398 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 } /* until end */
9400 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 PyErr_SetString(PyExc_TypeError,
9402 "not all arguments converted during string formatting");
9403 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 }
9405
Thomas Woutersa96affe2006-03-12 00:29:36 +00009406 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410 }
9411 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412 return (PyObject *)result;
9413
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 Py_XDECREF(result);
9416 Py_DECREF(uformat);
9417 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 }
9420 return NULL;
9421}
9422
Jeremy Hylton938ace62002-07-17 16:30:39 +00009423static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009424unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9425
Tim Peters6d6c1a32001-08-02 04:15:00 +00009426static PyObject *
9427unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9428{
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009430 static char *kwlist[] = {"object", "encoding", "errors", 0};
9431 char *encoding = NULL;
9432 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009433
Benjamin Peterson14339b62009-01-31 16:36:08 +00009434 if (type != &PyUnicode_Type)
9435 return unicode_subtype_new(type, args, kwds);
9436 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 return NULL;
9439 if (x == NULL)
9440 return (PyObject *)_PyUnicode_New(0);
9441 if (encoding == NULL && errors == NULL)
9442 return PyObject_Str(x);
9443 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009445}
9446
Guido van Rossume023fe02001-08-30 03:12:59 +00009447static PyObject *
9448unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9449{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009450 PyUnicodeObject *tmp, *pnew;
9451 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009452
Benjamin Peterson14339b62009-01-31 16:36:08 +00009453 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9454 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9455 if (tmp == NULL)
9456 return NULL;
9457 assert(PyUnicode_Check(tmp));
9458 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9459 if (pnew == NULL) {
9460 Py_DECREF(tmp);
9461 return NULL;
9462 }
9463 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9464 if (pnew->str == NULL) {
9465 _Py_ForgetReference((PyObject *)pnew);
9466 PyObject_Del(pnew);
9467 Py_DECREF(tmp);
9468 return PyErr_NoMemory();
9469 }
9470 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9471 pnew->length = n;
9472 pnew->hash = tmp->hash;
9473 Py_DECREF(tmp);
9474 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009475}
9476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009477PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009479\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009480Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009481encoding defaults to the current default string encoding.\n\
9482errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009483
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009484static PyObject *unicode_iter(PyObject *seq);
9485
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009487 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009488 "str", /* tp_name */
9489 sizeof(PyUnicodeObject), /* tp_size */
9490 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009492 (destructor)unicode_dealloc, /* tp_dealloc */
9493 0, /* tp_print */
9494 0, /* tp_getattr */
9495 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009496 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009497 unicode_repr, /* tp_repr */
9498 &unicode_as_number, /* tp_as_number */
9499 &unicode_as_sequence, /* tp_as_sequence */
9500 &unicode_as_mapping, /* tp_as_mapping */
9501 (hashfunc) unicode_hash, /* tp_hash*/
9502 0, /* tp_call*/
9503 (reprfunc) unicode_str, /* tp_str */
9504 PyObject_GenericGetAttr, /* tp_getattro */
9505 0, /* tp_setattro */
9506 0, /* tp_as_buffer */
9507 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009509 unicode_doc, /* tp_doc */
9510 0, /* tp_traverse */
9511 0, /* tp_clear */
9512 PyUnicode_RichCompare, /* tp_richcompare */
9513 0, /* tp_weaklistoffset */
9514 unicode_iter, /* tp_iter */
9515 0, /* tp_iternext */
9516 unicode_methods, /* tp_methods */
9517 0, /* tp_members */
9518 0, /* tp_getset */
9519 &PyBaseObject_Type, /* tp_base */
9520 0, /* tp_dict */
9521 0, /* tp_descr_get */
9522 0, /* tp_descr_set */
9523 0, /* tp_dictoffset */
9524 0, /* tp_init */
9525 0, /* tp_alloc */
9526 unicode_new, /* tp_new */
9527 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528};
9529
9530/* Initialize the Unicode implementation */
9531
Thomas Wouters78890102000-07-22 19:25:51 +00009532void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009534 int i;
9535
Thomas Wouters477c8d52006-05-27 19:21:47 +00009536 /* XXX - move this array to unicodectype.c ? */
9537 Py_UNICODE linebreak[] = {
9538 0x000A, /* LINE FEED */
9539 0x000D, /* CARRIAGE RETURN */
9540 0x001C, /* FILE SEPARATOR */
9541 0x001D, /* GROUP SEPARATOR */
9542 0x001E, /* RECORD SEPARATOR */
9543 0x0085, /* NEXT LINE */
9544 0x2028, /* LINE SEPARATOR */
9545 0x2029, /* PARAGRAPH SEPARATOR */
9546 };
9547
Fred Drakee4315f52000-05-09 19:53:39 +00009548 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009549 free_list = NULL;
9550 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009552 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009554
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009555 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009557 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009559
9560 /* initialize the linebreak bloom filter */
9561 bloom_linebreak = make_bloom_mask(
9562 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9563 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009564
9565 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566}
9567
9568/* Finalize the Unicode implementation */
9569
Christian Heimesa156e092008-02-16 07:38:31 +00009570int
9571PyUnicode_ClearFreeList(void)
9572{
9573 int freelist_size = numfree;
9574 PyUnicodeObject *u;
9575
9576 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 PyUnicodeObject *v = u;
9578 u = *(PyUnicodeObject **)u;
9579 if (v->str)
9580 PyObject_DEL(v->str);
9581 Py_XDECREF(v->defenc);
9582 PyObject_Del(v);
9583 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009584 }
9585 free_list = NULL;
9586 assert(numfree == 0);
9587 return freelist_size;
9588}
9589
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590void
Thomas Wouters78890102000-07-22 19:25:51 +00009591_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009593 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009595 Py_XDECREF(unicode_empty);
9596 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009597
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009598 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 if (unicode_latin1[i]) {
9600 Py_DECREF(unicode_latin1[i]);
9601 unicode_latin1[i] = NULL;
9602 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009603 }
Christian Heimesa156e092008-02-16 07:38:31 +00009604 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009606
Walter Dörwald16807132007-05-25 13:52:07 +00009607void
9608PyUnicode_InternInPlace(PyObject **p)
9609{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9611 PyObject *t;
9612 if (s == NULL || !PyUnicode_Check(s))
9613 Py_FatalError(
9614 "PyUnicode_InternInPlace: unicode strings only please!");
9615 /* If it's a subclass, we don't really know what putting
9616 it in the interned dict might do. */
9617 if (!PyUnicode_CheckExact(s))
9618 return;
9619 if (PyUnicode_CHECK_INTERNED(s))
9620 return;
9621 if (interned == NULL) {
9622 interned = PyDict_New();
9623 if (interned == NULL) {
9624 PyErr_Clear(); /* Don't leave an exception */
9625 return;
9626 }
9627 }
9628 /* It might be that the GetItem call fails even
9629 though the key is present in the dictionary,
9630 namely when this happens during a stack overflow. */
9631 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009633 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009634
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 if (t) {
9636 Py_INCREF(t);
9637 Py_DECREF(*p);
9638 *p = t;
9639 return;
9640 }
Walter Dörwald16807132007-05-25 13:52:07 +00009641
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 PyThreadState_GET()->recursion_critical = 1;
9643 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9644 PyErr_Clear();
9645 PyThreadState_GET()->recursion_critical = 0;
9646 return;
9647 }
9648 PyThreadState_GET()->recursion_critical = 0;
9649 /* The two references in interned are not counted by refcnt.
9650 The deallocator will take care of this */
9651 Py_REFCNT(s) -= 2;
9652 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009653}
9654
9655void
9656PyUnicode_InternImmortal(PyObject **p)
9657{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009658 PyUnicode_InternInPlace(p);
9659 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9660 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9661 Py_INCREF(*p);
9662 }
Walter Dörwald16807132007-05-25 13:52:07 +00009663}
9664
9665PyObject *
9666PyUnicode_InternFromString(const char *cp)
9667{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 PyObject *s = PyUnicode_FromString(cp);
9669 if (s == NULL)
9670 return NULL;
9671 PyUnicode_InternInPlace(&s);
9672 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009673}
9674
9675void _Py_ReleaseInternedUnicodeStrings(void)
9676{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 PyObject *keys;
9678 PyUnicodeObject *s;
9679 Py_ssize_t i, n;
9680 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009681
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682 if (interned == NULL || !PyDict_Check(interned))
9683 return;
9684 keys = PyDict_Keys(interned);
9685 if (keys == NULL || !PyList_Check(keys)) {
9686 PyErr_Clear();
9687 return;
9688 }
Walter Dörwald16807132007-05-25 13:52:07 +00009689
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9691 detector, interned unicode strings are not forcibly deallocated;
9692 rather, we give them their stolen references back, and then clear
9693 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009694
Benjamin Peterson14339b62009-01-31 16:36:08 +00009695 n = PyList_GET_SIZE(keys);
9696 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 for (i = 0; i < n; i++) {
9699 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9700 switch (s->state) {
9701 case SSTATE_NOT_INTERNED:
9702 /* XXX Shouldn't happen */
9703 break;
9704 case SSTATE_INTERNED_IMMORTAL:
9705 Py_REFCNT(s) += 1;
9706 immortal_size += s->length;
9707 break;
9708 case SSTATE_INTERNED_MORTAL:
9709 Py_REFCNT(s) += 2;
9710 mortal_size += s->length;
9711 break;
9712 default:
9713 Py_FatalError("Inconsistent interned string state.");
9714 }
9715 s->state = SSTATE_NOT_INTERNED;
9716 }
9717 fprintf(stderr, "total size of all interned strings: "
9718 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9719 "mortal/immortal\n", mortal_size, immortal_size);
9720 Py_DECREF(keys);
9721 PyDict_Clear(interned);
9722 Py_DECREF(interned);
9723 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009724}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009725
9726
9727/********************* Unicode Iterator **************************/
9728
9729typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 PyObject_HEAD
9731 Py_ssize_t it_index;
9732 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009733} unicodeiterobject;
9734
9735static void
9736unicodeiter_dealloc(unicodeiterobject *it)
9737{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009738 _PyObject_GC_UNTRACK(it);
9739 Py_XDECREF(it->it_seq);
9740 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009741}
9742
9743static int
9744unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009746 Py_VISIT(it->it_seq);
9747 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009748}
9749
9750static PyObject *
9751unicodeiter_next(unicodeiterobject *it)
9752{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009753 PyUnicodeObject *seq;
9754 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009755
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 assert(it != NULL);
9757 seq = it->it_seq;
9758 if (seq == NULL)
9759 return NULL;
9760 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009761
Benjamin Peterson14339b62009-01-31 16:36:08 +00009762 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9763 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 if (item != NULL)
9766 ++it->it_index;
9767 return item;
9768 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009769
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 Py_DECREF(seq);
9771 it->it_seq = NULL;
9772 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009773}
9774
9775static PyObject *
9776unicodeiter_len(unicodeiterobject *it)
9777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 Py_ssize_t len = 0;
9779 if (it->it_seq)
9780 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9781 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009782}
9783
9784PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9785
9786static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009787 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009789 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009790};
9791
9792PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009793 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9794 "str_iterator", /* tp_name */
9795 sizeof(unicodeiterobject), /* tp_basicsize */
9796 0, /* tp_itemsize */
9797 /* methods */
9798 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9799 0, /* tp_print */
9800 0, /* tp_getattr */
9801 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009802 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009803 0, /* tp_repr */
9804 0, /* tp_as_number */
9805 0, /* tp_as_sequence */
9806 0, /* tp_as_mapping */
9807 0, /* tp_hash */
9808 0, /* tp_call */
9809 0, /* tp_str */
9810 PyObject_GenericGetAttr, /* tp_getattro */
9811 0, /* tp_setattro */
9812 0, /* tp_as_buffer */
9813 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9814 0, /* tp_doc */
9815 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9816 0, /* tp_clear */
9817 0, /* tp_richcompare */
9818 0, /* tp_weaklistoffset */
9819 PyObject_SelfIter, /* tp_iter */
9820 (iternextfunc)unicodeiter_next, /* tp_iternext */
9821 unicodeiter_methods, /* tp_methods */
9822 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009823};
9824
9825static PyObject *
9826unicode_iter(PyObject *seq)
9827{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009829
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 if (!PyUnicode_Check(seq)) {
9831 PyErr_BadInternalCall();
9832 return NULL;
9833 }
9834 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9835 if (it == NULL)
9836 return NULL;
9837 it->it_index = 0;
9838 Py_INCREF(seq);
9839 it->it_seq = (PyUnicodeObject *)seq;
9840 _PyObject_GC_TRACK(it);
9841 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009842}
9843
Martin v. Löwis5b222132007-06-10 09:51:05 +00009844size_t
9845Py_UNICODE_strlen(const Py_UNICODE *u)
9846{
9847 int res = 0;
9848 while(*u++)
9849 res++;
9850 return res;
9851}
9852
9853Py_UNICODE*
9854Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9855{
9856 Py_UNICODE *u = s1;
9857 while ((*u++ = *s2++));
9858 return s1;
9859}
9860
9861Py_UNICODE*
9862Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9863{
9864 Py_UNICODE *u = s1;
9865 while ((*u++ = *s2++))
9866 if (n-- == 0)
9867 break;
9868 return s1;
9869}
9870
9871int
9872Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9873{
9874 while (*s1 && *s2 && *s1 == *s2)
9875 s1++, s2++;
9876 if (*s1 && *s2)
9877 return (*s1 < *s2) ? -1 : +1;
9878 if (*s1)
9879 return 1;
9880 if (*s2)
9881 return -1;
9882 return 0;
9883}
9884
9885Py_UNICODE*
9886Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9887{
9888 const Py_UNICODE *p;
9889 for (p = s; *p; p++)
9890 if (*p == c)
9891 return (Py_UNICODE*)p;
9892 return NULL;
9893}
9894
9895
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009896#ifdef __cplusplus
9897}
9898#endif
9899
9900
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009901/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 Local variables:
9903 c-basic-offset: 4
9904 indent-tabs-mode: nil
9905 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009906*/