blob: 6ad73e0899c5413c5e95d559abf662d284d5b956 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: entry c of
   this 128-element table is 1 when the 7-bit character c is treated as
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Same for linebreaks: entry c of this 128-element table is 1 when the
   7-bit character c is treated as a line break and 0 otherwise.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the least significant 5 bits
   of each Unicode character serving as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero iff the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that bit 31 can be
   addressed without overflowing a signed int, and mask is parenthesized
   so that any expression can be passed for it. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero iff ch is a line break: characters below 128 are answered by
   the ascii_linebreak table; for the rest the bloom mask is consulted
   first and Py_UNICODE_ISLINEBREAK only when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* Nonzero iff chr is one of the setlen characters in set.  The bloom
   mask is tested first; unicode_member() is only called when the mask
   bit for chr is set.  The whole expansion is parenthesized so that the
   macro can safely be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000794 unsigned char *s = va_arg(count, unsigned char*);
795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis5b222132007-06-10 09:51:05 +00001533char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001534_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001535{
Christian Heimesf3863112007-11-22 07:46:41 +00001536 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001537 if (!PyUnicode_Check(unicode)) {
1538 PyErr_BadArgument();
1539 return NULL;
1540 }
Christian Heimesf3863112007-11-22 07:46:41 +00001541 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1542 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001543 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001544 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001545 *psize = PyBytes_GET_SIZE(bytes);
1546 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001547}
1548
1549char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001550_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001551{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001552 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001553}
1554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1556{
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561 return PyUnicode_AS_UNICODE(unicode);
1562
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 return NULL;
1565}
1566
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568{
1569 if (!PyUnicode_Check(unicode)) {
1570 PyErr_BadArgument();
1571 goto onError;
1572 }
1573 return PyUnicode_GET_SIZE(unicode);
1574
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 return -1;
1577}
1578
Thomas Wouters78890102000-07-22 19:25:51 +00001579const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001580{
1581 return unicode_default_encoding;
1582}
1583
1584int PyUnicode_SetDefaultEncoding(const char *encoding)
1585{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001586 if (strcmp(encoding, unicode_default_encoding) != 0) {
1587 PyErr_Format(PyExc_ValueError,
1588 "Can only set default encoding to %s",
1589 unicode_default_encoding);
1590 return -1;
1591 }
Fred Drakee4315f52000-05-09 19:53:39 +00001592 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001593}
1594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001595/* error handling callback helper:
1596 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001597 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 and adjust various state variables.
1599 return 0 on success, -1 on error
1600*/
1601
1602static
1603int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 const char *encoding, const char *reason,
1605 const char **input, const char **inend, Py_ssize_t *startinpos,
1606 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1607 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001609 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610
1611 PyObject *restuple = NULL;
1612 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001613 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001615 Py_ssize_t requiredsize;
1616 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001618 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001619 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 int res = -1;
1621
1622 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001623 *errorHandler = PyCodec_LookupError(errors);
1624 if (*errorHandler == NULL)
1625 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 }
1627
1628 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001629 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001630 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1631 if (*exceptionObject == NULL)
1632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 }
1634 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001635 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1636 goto onError;
1637 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1638 goto onError;
1639 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1640 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001641 }
1642
1643 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1644 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001645 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001647 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 }
1650 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001652
1653 /* Copy back the bytes variables, which might have been modified by the
1654 callback */
1655 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1656 if (!inputobj)
1657 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001658 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001660 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001661 *input = PyBytes_AS_STRING(inputobj);
1662 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001663 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001664 /* we can DECREF safely, as the exception has another reference,
1665 so the object won't go away. */
1666 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001670 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1672 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001673 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674
1675 /* need more space? (at least enough for what we
1676 have+the replacement+the rest of the string (starting
1677 at the new input position), so we won't have to check space
1678 when there are no errors in the rest of the string) */
1679 repptr = PyUnicode_AS_UNICODE(repunicode);
1680 repsize = PyUnicode_GET_SIZE(repunicode);
1681 requiredsize = *outpos + repsize + insize-newpos;
1682 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001683 if (requiredsize<2*outsize)
1684 requiredsize = 2*outsize;
1685 if (_PyUnicode_Resize(output, requiredsize) < 0)
1686 goto onError;
1687 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 }
1689 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001690 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 Py_UNICODE_COPY(*outptr, repptr, repsize);
1692 *outptr += repsize;
1693 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 /* we made it! */
1696 res = 0;
1697
Benjamin Peterson29060642009-01-31 22:14:21 +00001698 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 Py_XDECREF(restuple);
1700 return res;
1701}
1702
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703/* --- UTF-7 Codec -------------------------------------------------------- */
1704
1705/* see RFC2152 for details */
1706
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character code (16 entries per row).  Consulted through the
   SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00-0x0f: tab, LF and CR are whitespace (2), the rest special */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space is whitespace; '+' and '/' are special */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: backslash is special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: '~' and DEL are special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1725
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded: it is outside 1..127, is
   classified 1 in utf7_special, is whitespace (2) while encodeWS is set,
   or is in Set O (3) while encodeO is set.  Evaluates c several times. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 digit (alphanumeric, '+' or '/'). */
#define B64CHAR(c)                                      \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Six-bit value of base64 digit c: '+' -> 62, '/' -> 63, 'a'.. -> c - 71,
   'A'.. -> c - 65, anything else (digits) -> c + 4.  No validation is
   done; callers check B64CHAR() first. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` while at least six bits are pending in
   ch; `bits` is reduced by six per digit.  Expands to a bare while loop
   and modifies `out` and `bits` in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units through `out` while at least sixteen bits are pending
   in ch.  A unit in 0xDC00..0xDFFF sets `surrogate`, sets errmsg and jumps
   to utf7Error; the unit following such an error is dropped and the flag
   is cleared.  Requires a variable `errmsg` and a label `utf7Error` at the
   expansion site. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001768
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 Py_ssize_t size,
1771 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001773 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1774}
1775
/* Decode `size` bytes of UTF-7 starting at `s` into a new unicode object.

   errors    name of the error handler, passed on to
             unicode_decode_call_errorhandler()
   consumed  if NULL, input that ends inside a shift sequence is reported
             through the error handler ("unterminated shift sequence").
             If non-NULL, no such error is raised and *consumed receives
             the offset of the '+' that opened the still-open shift
             sequence, or otherwise the number of bytes processed.

   The output is allocated with room for `size` code units up front and
   trimmed to the real length at the end.  Returns the new object, or
   NULL on error. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                /* inside a '+...' base64 run */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bit accumulator for the base64 run */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Also entered by goto from the unterminated-shift handling after
           the loop, when the error handler leaves input to process. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* end of the shift sequence */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    /* the terminating character is itself output */
                    *p++ = ch;
                }
            } else {
                /* another base64 digit: accumulate six more bits */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence started, for error reporting
               and for *consumed */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" is a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
      utf7Error:
        /* The handler may resize `unicode` and move s, e, starts and p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence: an error unless the caller
       asked for the consumed count. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* trim the over-allocated buffer to what was actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1921
1922
1923PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001924 Py_ssize_t size,
1925 int encodeSetO,
1926 int encodeWhiteSpace,
1927 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001928{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001929 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001930 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001931 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001933 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934 unsigned int bitsleft = 0;
1935 unsigned long charsleft = 0;
1936 char * out;
1937 char * start;
1938
1939 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001942 if (cbAllocated / 5 != size)
1943 return PyErr_NoMemory();
1944
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001945 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001946 if (v == NULL)
1947 return NULL;
1948
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001949 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950 for (;i < size; ++i) {
1951 Py_UNICODE ch = s[i];
1952
1953 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001954 if (ch == '+') {
1955 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956 *out++ = '-';
1957 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1958 charsleft = ch;
1959 bitsleft = 16;
1960 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001961 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001963 } else {
1964 *out++ = (char) ch;
1965 }
1966 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001967 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1968 *out++ = B64(charsleft << (6-bitsleft));
1969 charsleft = 0;
1970 bitsleft = 0;
1971 /* Characters not in the BASE64 set implicitly unshift the sequence
1972 so no '-' is required, except if the character is itself a '-' */
1973 if (B64CHAR(ch) || ch == '-') {
1974 *out++ = '-';
1975 }
1976 inShift = 0;
1977 *out++ = (char) ch;
1978 } else {
1979 bitsleft += 16;
1980 charsleft = (charsleft << 16) | ch;
1981 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1982
Mark Dickinson934896d2009-02-21 20:59:32 +00001983 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001984 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001985 or '-' then the shift sequence will be terminated implicitly and we
1986 don't have to insert a '-'. */
1987
1988 if (bitsleft == 0) {
1989 if (i + 1 < size) {
1990 Py_UNICODE ch2 = s[i+1];
1991
1992 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001993
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994 } else if (B64CHAR(ch2) || ch2 == '-') {
1995 *out++ = '-';
1996 inShift = 0;
1997 } else {
1998 inShift = 0;
1999 }
2000
2001 }
2002 else {
2003 *out++ = '-';
2004 inShift = 0;
2005 }
2006 }
Tim Petersced69f82003-09-16 20:30:58 +00002007 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002008 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002009 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 if (bitsleft) {
2011 *out++= B64(charsleft << (6-bitsleft) );
2012 *out++ = '-';
2013 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002014 if (_PyBytes_Resize(&v, out - start) < 0)
2015 return NULL;
2016 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002017}
2018
2019#undef SPECIAL
2020#undef B64
2021#undef B64CHAR
2022#undef UB64
2023#undef ENCODE
2024#undef DECODE
2025
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026/* --- UTF-8 Codec -------------------------------------------------------- */
2027
/* Length in bytes of the UTF-8 sequence introduced by each possible first
   byte, indexed by byte value (16 entries per row). */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7f: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0-0xf7: four, 0xf8-0xfb: five, 0xfc-0xfd: six,
       0xfe-0xff: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2049
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002051 Py_ssize_t size,
2052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053{
Walter Dörwald69652032004-09-07 20:24:22 +00002054 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2055}
2056
Antoine Pitrouab868312009-01-10 15:40:25 +00002057/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2058#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2059
2060/* Mask to quickly check whether a C 'long' contains a
2061 non-ASCII, UTF8-encoded char. */
2062#if (SIZEOF_LONG == 8)
2063# define ASCII_CHAR_MASK 0x8080808080808080L
2064#elif (SIZEOF_LONG == 4)
2065# define ASCII_CHAR_MASK 0x80808080L
2066#else
2067# error C 'long' size should be either 4 or 8!
2068#endif
2069
Walter Dörwald69652032004-09-07 20:24:22 +00002070PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 Py_ssize_t size,
2072 const char *errors,
2073 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002074{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002077 Py_ssize_t startinpos;
2078 Py_ssize_t endinpos;
2079 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002080 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 PyUnicodeObject *unicode;
2082 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002083 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 PyObject *errorHandler = NULL;
2085 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086
2087 /* Note: size will always be longer than the resulting Unicode
2088 character count */
2089 unicode = _PyUnicode_New(size);
2090 if (!unicode)
2091 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002092 if (size == 0) {
2093 if (consumed)
2094 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 /* Unpack UTF-8 encoded data */
2099 p = unicode->str;
2100 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002101 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102
2103 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002104 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105
2106 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002107 /* Fast path for runs of ASCII characters. Given that common UTF-8
2108 input will consist of an overwhelming majority of ASCII
2109 characters, we try to optimize for this case by checking
2110 as many characters as a C 'long' can contain.
2111 First, check if we can do an aligned read, as most CPUs have
2112 a penalty for unaligned reads.
2113 */
2114 if (!((size_t) s & LONG_PTR_MASK)) {
2115 /* Help register allocation */
2116 register const char *_s = s;
2117 register Py_UNICODE *_p = p;
2118 while (_s < aligned_end) {
2119 /* Read a whole long at a time (either 4 or 8 bytes),
2120 and do a fast unrolled copy if it only contains ASCII
2121 characters. */
2122 unsigned long data = *(unsigned long *) _s;
2123 if (data & ASCII_CHAR_MASK)
2124 break;
2125 _p[0] = (unsigned char) _s[0];
2126 _p[1] = (unsigned char) _s[1];
2127 _p[2] = (unsigned char) _s[2];
2128 _p[3] = (unsigned char) _s[3];
2129#if (SIZEOF_LONG == 8)
2130 _p[4] = (unsigned char) _s[4];
2131 _p[5] = (unsigned char) _s[5];
2132 _p[6] = (unsigned char) _s[6];
2133 _p[7] = (unsigned char) _s[7];
2134#endif
2135 _s += SIZEOF_LONG;
2136 _p += SIZEOF_LONG;
2137 }
2138 s = _s;
2139 p = _p;
2140 if (s == e)
2141 break;
2142 ch = (unsigned char)*s;
2143 }
2144 }
2145
2146 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002147 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 s++;
2149 continue;
2150 }
2151
2152 n = utf8_code_length[ch];
2153
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002154 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002155 if (consumed)
2156 break;
2157 else {
2158 errmsg = "unexpected end of data";
2159 startinpos = s-starts;
2160 endinpos = size;
2161 goto utf8Error;
2162 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164
2165 switch (n) {
2166
2167 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002168 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002169 startinpos = s-starts;
2170 endinpos = startinpos+1;
2171 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172
2173 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002174 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 startinpos = s-starts;
2176 endinpos = startinpos+1;
2177 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178
2179 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002180 if ((s[1] & 0xc0) != 0x80) {
2181 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002182 startinpos = s-starts;
2183 endinpos = startinpos+2;
2184 goto utf8Error;
2185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002187 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 startinpos = s-starts;
2189 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002190 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002191 goto utf8Error;
2192 }
2193 else
2194 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 break;
2196
2197 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002198 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002199 (s[2] & 0xc0) != 0x80) {
2200 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002201 startinpos = s-starts;
2202 endinpos = startinpos+3;
2203 goto utf8Error;
2204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002206 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002207 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002208 startinpos = s-starts;
2209 endinpos = startinpos+3;
2210 goto utf8Error;
2211 }
2212 else
2213 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 break;
2215
2216 case 4:
2217 if ((s[1] & 0xc0) != 0x80 ||
2218 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002219 (s[3] & 0xc0) != 0x80) {
2220 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 startinpos = s-starts;
2222 endinpos = startinpos+4;
2223 goto utf8Error;
2224 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002225 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002226 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002227 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002228 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002229 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002230 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002231 UTF-16 */
2232 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002233 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002234 startinpos = s-starts;
2235 endinpos = startinpos+4;
2236 goto utf8Error;
2237 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002238#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002239 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002240#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002241 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002242
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 /* translate from 10000..10FFFF to 0..FFFF */
2244 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002245
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002246 /* high surrogate = top 10 bits added to D800 */
2247 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002248
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002249 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002250 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002251#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 break;
2253
2254 default:
2255 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002256 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002257 startinpos = s-starts;
2258 endinpos = startinpos+n;
2259 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 }
2261 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002262 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002263
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 utf8Error:
2265 outpos = p-PyUnicode_AS_UNICODE(unicode);
2266 if (unicode_decode_call_errorhandler(
2267 errors, &errorHandler,
2268 "utf8", errmsg,
2269 &starts, &e, &startinpos, &endinpos, &exc, &s,
2270 &unicode, &outpos, &p))
2271 goto onError;
2272 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
Walter Dörwald69652032004-09-07 20:24:22 +00002274 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276
2277 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002278 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 goto onError;
2280
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002281 Py_XDECREF(errorHandler);
2282 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 return (PyObject *)unicode;
2284
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 Py_XDECREF(errorHandler);
2287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 Py_DECREF(unicode);
2289 return NULL;
2290}
2291
Antoine Pitrouab868312009-01-10 15:40:25 +00002292#undef ASCII_CHAR_MASK
2293
2294
Tim Peters602f7402002-04-27 18:03:26 +00002295/* Allocation strategy: if the string is short, convert into a stack buffer
2296 and allocate exactly as much space needed at the end. Else allocate the
2297 maximum possible needed (4 result bytes per Unicode character), and return
2298 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002299*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002300PyObject *
2301PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 Py_ssize_t size,
2303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304{
Tim Peters602f7402002-04-27 18:03:26 +00002305#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002306
Guido van Rossum98297ee2007-11-06 21:34:58 +00002307 Py_ssize_t i; /* index into s of next input byte */
2308 PyObject *result; /* result string object */
2309 char *p; /* next free byte in output buffer */
2310 Py_ssize_t nallocated; /* number of result bytes allocated */
2311 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002312 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002313 PyObject *errorHandler = NULL;
2314 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002315
Tim Peters602f7402002-04-27 18:03:26 +00002316 assert(s != NULL);
2317 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
Tim Peters602f7402002-04-27 18:03:26 +00002319 if (size <= MAX_SHORT_UNICHARS) {
2320 /* Write into the stack buffer; nallocated can't overflow.
2321 * At the end, we'll allocate exactly as much heap space as it
2322 * turns out we need.
2323 */
2324 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002325 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002326 p = stackbuf;
2327 }
2328 else {
2329 /* Overallocate on the heap, and give the excess back at the end. */
2330 nallocated = size * 4;
2331 if (nallocated / 4 != size) /* overflow! */
2332 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002333 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002334 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002335 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002336 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002337 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002338
Tim Peters602f7402002-04-27 18:03:26 +00002339 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002340 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002341
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002342 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002343 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002345
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002347 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002348 *p++ = (char)(0xc0 | (ch >> 6));
2349 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002350 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002351 else {
Tim Peters602f7402002-04-27 18:03:26 +00002352 /* Encode UCS2 Unicode ordinals */
2353 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002354#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002355 /* Special case: check for high surrogate */
2356 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2357 Py_UCS4 ch2 = s[i];
2358 /* Check for low surrogate and combine the two to
2359 form a UCS4 value */
2360 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002361 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002362 i++;
2363 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002364 }
Tim Peters602f7402002-04-27 18:03:26 +00002365 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002366 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002367#endif
2368 if (ch >= 0xd800 && ch <= 0xdfff) {
2369 Py_ssize_t newpos;
2370 PyObject *rep;
2371 char *prep;
2372 int k;
2373 rep = unicode_encode_call_errorhandler
2374 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2375 s, size, &exc, i-1, i, &newpos);
2376 if (!rep)
2377 goto error;
2378 /* Implementation limitations: only support error handler that return
2379 bytes, and only support up to four replacement bytes. */
2380 if (!PyBytes_Check(rep)) {
2381 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2382 Py_DECREF(rep);
2383 goto error;
2384 }
2385 if (PyBytes_Size(rep) > 4) {
2386 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2387 Py_DECREF(rep);
2388 goto error;
2389 }
2390 prep = PyBytes_AsString(rep);
2391 for(k = PyBytes_Size(rep); k > 0; k--)
2392 *p++ = *prep++;
2393 Py_DECREF(rep);
2394 continue;
2395
2396 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002397 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002398 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2399 *p++ = (char)(0x80 | (ch & 0x3f));
2400 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002403 /* Encode UCS4 Unicode ordinals */
2404 *p++ = (char)(0xf0 | (ch >> 18));
2405 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2406 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2407 *p++ = (char)(0x80 | (ch & 0x3f));
2408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002410
Guido van Rossum98297ee2007-11-06 21:34:58 +00002411 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002412 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002413 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002414 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002415 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002416 }
2417 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002418 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002419 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002420 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002421 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002422 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002425 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002426 error:
2427 Py_XDECREF(errorHandler);
2428 Py_XDECREF(exc);
2429 Py_XDECREF(result);
2430 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002431
Tim Peters602f7402002-04-27 18:03:26 +00002432#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433}
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 if (!PyUnicode_Check(unicode)) {
2438 PyErr_BadArgument();
2439 return NULL;
2440 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002441 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002442 PyUnicode_GET_SIZE(unicode),
2443 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444}
2445
Walter Dörwald41980ca2007-08-16 21:55:45 +00002446/* --- UTF-32 Codec ------------------------------------------------------- */
2447
2448PyObject *
2449PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 Py_ssize_t size,
2451 const char *errors,
2452 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002453{
2454 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2455}
2456
2457PyObject *
2458PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 Py_ssize_t size,
2460 const char *errors,
2461 int *byteorder,
2462 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002463{
2464 const char *starts = s;
2465 Py_ssize_t startinpos;
2466 Py_ssize_t endinpos;
2467 Py_ssize_t outpos;
2468 PyUnicodeObject *unicode;
2469 Py_UNICODE *p;
2470#ifndef Py_UNICODE_WIDE
2471 int i, pairs;
2472#else
2473 const int pairs = 0;
2474#endif
2475 const unsigned char *q, *e;
2476 int bo = 0; /* assume native ordering by default */
2477 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002478 /* Offsets from q for retrieving bytes in the right order. */
2479#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2480 int iorder[] = {0, 1, 2, 3};
2481#else
2482 int iorder[] = {3, 2, 1, 0};
2483#endif
2484 PyObject *errorHandler = NULL;
2485 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002486 /* On narrow builds we split characters outside the BMP into two
2487 codepoints => count how much extra space we need. */
2488#ifndef Py_UNICODE_WIDE
2489 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 if (((Py_UCS4 *)s)[i] >= 0x10000)
2491 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002492#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002493
2494 /* This might be one to much, because of a BOM */
2495 unicode = _PyUnicode_New((size+3)/4+pairs);
2496 if (!unicode)
2497 return NULL;
2498 if (size == 0)
2499 return (PyObject *)unicode;
2500
2501 /* Unpack UTF-32 encoded data */
2502 p = unicode->str;
2503 q = (unsigned char *)s;
2504 e = q + size;
2505
2506 if (byteorder)
2507 bo = *byteorder;
2508
2509 /* Check for BOM marks (U+FEFF) in the input and adjust current
2510 byte order setting accordingly. In native mode, the leading BOM
2511 mark is skipped, in all other modes, it is copied to the output
2512 stream as-is (giving a ZWNBSP character). */
2513 if (bo == 0) {
2514 if (size >= 4) {
2515 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002516 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 if (bom == 0x0000FEFF) {
2519 q += 4;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE0000) {
2523 q += 4;
2524 bo = 1;
2525 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002526#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002527 if (bom == 0x0000FEFF) {
2528 q += 4;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE0000) {
2532 q += 4;
2533 bo = -1;
2534 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002535#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002536 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002537 }
2538
2539 if (bo == -1) {
2540 /* force LE */
2541 iorder[0] = 0;
2542 iorder[1] = 1;
2543 iorder[2] = 2;
2544 iorder[3] = 3;
2545 }
2546 else if (bo == 1) {
2547 /* force BE */
2548 iorder[0] = 3;
2549 iorder[1] = 2;
2550 iorder[2] = 1;
2551 iorder[3] = 0;
2552 }
2553
2554 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 Py_UCS4 ch;
2556 /* remaining bytes at the end? (size should be divisible by 4) */
2557 if (e-q<4) {
2558 if (consumed)
2559 break;
2560 errmsg = "truncated data";
2561 startinpos = ((const char *)q)-starts;
2562 endinpos = ((const char *)e)-starts;
2563 goto utf32Error;
2564 /* The remaining input chars are ignored if the callback
2565 chooses to skip the input */
2566 }
2567 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2568 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002569
Benjamin Peterson29060642009-01-31 22:14:21 +00002570 if (ch >= 0x110000)
2571 {
2572 errmsg = "codepoint not in range(0x110000)";
2573 startinpos = ((const char *)q)-starts;
2574 endinpos = startinpos+4;
2575 goto utf32Error;
2576 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002577#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002578 if (ch >= 0x10000)
2579 {
2580 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2581 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2582 }
2583 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002584#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002585 *p++ = ch;
2586 q += 4;
2587 continue;
2588 utf32Error:
2589 outpos = p-PyUnicode_AS_UNICODE(unicode);
2590 if (unicode_decode_call_errorhandler(
2591 errors, &errorHandler,
2592 "utf32", errmsg,
2593 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2594 &unicode, &outpos, &p))
2595 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002596 }
2597
2598 if (byteorder)
2599 *byteorder = bo;
2600
2601 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002602 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002603
2604 /* Adjust length */
2605 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2606 goto onError;
2607
2608 Py_XDECREF(errorHandler);
2609 Py_XDECREF(exc);
2610 return (PyObject *)unicode;
2611
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002613 Py_DECREF(unicode);
2614 Py_XDECREF(errorHandler);
2615 Py_XDECREF(exc);
2616 return NULL;
2617}
2618
2619PyObject *
2620PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 Py_ssize_t size,
2622 const char *errors,
2623 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002624{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002625 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002626 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002627 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002628#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002629 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002630#else
2631 const int pairs = 0;
2632#endif
2633 /* Offsets from p for storing byte pairs in the right order. */
2634#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2635 int iorder[] = {0, 1, 2, 3};
2636#else
2637 int iorder[] = {3, 2, 1, 0};
2638#endif
2639
Benjamin Peterson29060642009-01-31 22:14:21 +00002640#define STORECHAR(CH) \
2641 do { \
2642 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2643 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2644 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2645 p[iorder[0]] = (CH) & 0xff; \
2646 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647 } while(0)
2648
2649 /* In narrow builds we can output surrogate pairs as one codepoint,
2650 so we need less space. */
2651#ifndef Py_UNICODE_WIDE
2652 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2654 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2655 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002656#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002657 nsize = (size - pairs + (byteorder == 0));
2658 bytesize = nsize * 4;
2659 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002661 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002662 if (v == NULL)
2663 return NULL;
2664
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002665 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002666 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002667 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002668 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002669 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002670
2671 if (byteorder == -1) {
2672 /* force LE */
2673 iorder[0] = 0;
2674 iorder[1] = 1;
2675 iorder[2] = 2;
2676 iorder[3] = 3;
2677 }
2678 else if (byteorder == 1) {
2679 /* force BE */
2680 iorder[0] = 3;
2681 iorder[1] = 2;
2682 iorder[2] = 1;
2683 iorder[3] = 0;
2684 }
2685
2686 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002688#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2690 Py_UCS4 ch2 = *s;
2691 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2692 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2693 s++;
2694 size--;
2695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002697#endif
2698 STORECHAR(ch);
2699 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002700
2701 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002702 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002703#undef STORECHAR
2704}
2705
2706PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2707{
2708 if (!PyUnicode_Check(unicode)) {
2709 PyErr_BadArgument();
2710 return NULL;
2711 }
2712 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 PyUnicode_GET_SIZE(unicode),
2714 NULL,
2715 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002716}
2717
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718/* --- UTF-16 Codec ------------------------------------------------------- */
2719
Tim Peters772747b2001-08-09 22:21:55 +00002720PyObject *
2721PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 Py_ssize_t size,
2723 const char *errors,
2724 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725{
Walter Dörwald69652032004-09-07 20:24:22 +00002726 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2727}
2728
Antoine Pitrouab868312009-01-10 15:40:25 +00002729/* Two masks for fast checking of whether a C 'long' may contain
2730 UTF16-encoded surrogate characters. This is an efficient heuristic,
2731 assuming that non-surrogate characters with a code point >= 0x8000 are
2732 rare in most input.
2733 FAST_CHAR_MASK is used when the input is in native byte ordering,
2734 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002735*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002736#if (SIZEOF_LONG == 8)
2737# define FAST_CHAR_MASK 0x8000800080008000L
2738# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2739#elif (SIZEOF_LONG == 4)
2740# define FAST_CHAR_MASK 0x80008000L
2741# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2742#else
2743# error C 'long' size should be either 4 or 8!
2744#endif
2745
Walter Dörwald69652032004-09-07 20:24:22 +00002746PyObject *
2747PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 Py_ssize_t size,
2749 const char *errors,
2750 int *byteorder,
2751 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002754 Py_ssize_t startinpos;
2755 Py_ssize_t endinpos;
2756 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 PyUnicodeObject *unicode;
2758 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002759 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002760 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002761 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002762 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002763 /* Offsets from q for retrieving byte pairs in the right order. */
2764#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2765 int ihi = 1, ilo = 0;
2766#else
2767 int ihi = 0, ilo = 1;
2768#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 PyObject *errorHandler = NULL;
2770 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771
2772 /* Note: size will always be longer than the resulting Unicode
2773 character count */
2774 unicode = _PyUnicode_New(size);
2775 if (!unicode)
2776 return NULL;
2777 if (size == 0)
2778 return (PyObject *)unicode;
2779
2780 /* Unpack UTF-16 encoded data */
2781 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002782 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002783 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784
2785 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002786 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002788 /* Check for BOM marks (U+FEFF) in the input and adjust current
2789 byte order setting accordingly. In native mode, the leading BOM
2790 mark is skipped, in all other modes, it is copied to the output
2791 stream as-is (giving a ZWNBSP character). */
2792 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002793 if (size >= 2) {
2794 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002795#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 if (bom == 0xFEFF) {
2797 q += 2;
2798 bo = -1;
2799 }
2800 else if (bom == 0xFFFE) {
2801 q += 2;
2802 bo = 1;
2803 }
Tim Petersced69f82003-09-16 20:30:58 +00002804#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 if (bom == 0xFEFF) {
2806 q += 2;
2807 bo = 1;
2808 }
2809 else if (bom == 0xFFFE) {
2810 q += 2;
2811 bo = -1;
2812 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002813#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816
Tim Peters772747b2001-08-09 22:21:55 +00002817 if (bo == -1) {
2818 /* force LE */
2819 ihi = 1;
2820 ilo = 0;
2821 }
2822 else if (bo == 1) {
2823 /* force BE */
2824 ihi = 0;
2825 ilo = 1;
2826 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002827#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2828 native_ordering = ilo < ihi;
2829#else
2830 native_ordering = ilo > ihi;
2831#endif
Tim Peters772747b2001-08-09 22:21:55 +00002832
Antoine Pitrouab868312009-01-10 15:40:25 +00002833 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002834 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002836 /* First check for possible aligned read of a C 'long'. Unaligned
2837 reads are more expensive, better to defer to another iteration. */
2838 if (!((size_t) q & LONG_PTR_MASK)) {
2839 /* Fast path for runs of non-surrogate chars. */
2840 register const unsigned char *_q = q;
2841 Py_UNICODE *_p = p;
2842 if (native_ordering) {
2843 /* Native ordering is simple: as long as the input cannot
2844 possibly contain a surrogate char, do an unrolled copy
2845 of several 16-bit code points to the target object.
2846 The non-surrogate check is done on several input bytes
2847 at a time (as many as a C 'long' can contain). */
2848 while (_q < aligned_end) {
2849 unsigned long data = * (unsigned long *) _q;
2850 if (data & FAST_CHAR_MASK)
2851 break;
2852 _p[0] = ((unsigned short *) _q)[0];
2853 _p[1] = ((unsigned short *) _q)[1];
2854#if (SIZEOF_LONG == 8)
2855 _p[2] = ((unsigned short *) _q)[2];
2856 _p[3] = ((unsigned short *) _q)[3];
2857#endif
2858 _q += SIZEOF_LONG;
2859 _p += SIZEOF_LONG / 2;
2860 }
2861 }
2862 else {
2863 /* Byteswapped ordering is similar, but we must decompose
2864 the copy bytewise, and take care of zero'ing out the
2865 upper bytes if the target object is in 32-bit units
2866 (that is, in UCS-4 builds). */
2867 while (_q < aligned_end) {
2868 unsigned long data = * (unsigned long *) _q;
2869 if (data & SWAPPED_FAST_CHAR_MASK)
2870 break;
2871 /* Zero upper bytes in UCS-4 builds */
2872#if (Py_UNICODE_SIZE > 2)
2873 _p[0] = 0;
2874 _p[1] = 0;
2875#if (SIZEOF_LONG == 8)
2876 _p[2] = 0;
2877 _p[3] = 0;
2878#endif
2879#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002880 /* Issue #4916; UCS-4 builds on big endian machines must
2881 fill the two last bytes of each 4-byte unit. */
2882#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2883# define OFF 2
2884#else
2885# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002886#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002887 ((unsigned char *) _p)[OFF + 1] = _q[0];
2888 ((unsigned char *) _p)[OFF + 0] = _q[1];
2889 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2890 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2891#if (SIZEOF_LONG == 8)
2892 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2893 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2894 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2895 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2896#endif
2897#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002898 _q += SIZEOF_LONG;
2899 _p += SIZEOF_LONG / 2;
2900 }
2901 }
2902 p = _p;
2903 q = _q;
2904 if (q >= e)
2905 break;
2906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908
Benjamin Peterson14339b62009-01-31 16:36:08 +00002909 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002910
2911 if (ch < 0xD800 || ch > 0xDFFF) {
2912 *p++ = ch;
2913 continue;
2914 }
2915
2916 /* UTF-16 code pair: */
2917 if (q > e) {
2918 errmsg = "unexpected end of data";
2919 startinpos = (((const char *)q) - 2) - starts;
2920 endinpos = ((const char *)e) + 1 - starts;
2921 goto utf16Error;
2922 }
2923 if (0xD800 <= ch && ch <= 0xDBFF) {
2924 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2925 q += 2;
2926 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002927#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 *p++ = ch;
2929 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002930#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002932#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 continue;
2934 }
2935 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002936 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 startinpos = (((const char *)q)-4)-starts;
2938 endinpos = startinpos+2;
2939 goto utf16Error;
2940 }
2941
Benjamin Peterson14339b62009-01-31 16:36:08 +00002942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 errmsg = "illegal encoding";
2944 startinpos = (((const char *)q)-2)-starts;
2945 endinpos = startinpos+2;
2946 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002947
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 utf16Error:
2949 outpos = p - PyUnicode_AS_UNICODE(unicode);
2950 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002951 errors,
2952 &errorHandler,
2953 "utf16", errmsg,
2954 &starts,
2955 (const char **)&e,
2956 &startinpos,
2957 &endinpos,
2958 &exc,
2959 (const char **)&q,
2960 &unicode,
2961 &outpos,
2962 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00002963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002965 /* remaining byte at the end? (size should be even) */
2966 if (e == q) {
2967 if (!consumed) {
2968 errmsg = "truncated data";
2969 startinpos = ((const char *)q) - starts;
2970 endinpos = ((const char *)e) + 1 - starts;
2971 outpos = p - PyUnicode_AS_UNICODE(unicode);
2972 if (unicode_decode_call_errorhandler(
2973 errors,
2974 &errorHandler,
2975 "utf16", errmsg,
2976 &starts,
2977 (const char **)&e,
2978 &startinpos,
2979 &endinpos,
2980 &exc,
2981 (const char **)&q,
2982 &unicode,
2983 &outpos,
2984 &p))
2985 goto onError;
2986 /* The remaining input chars are ignored if the callback
2987 chooses to skip the input */
2988 }
2989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990
2991 if (byteorder)
2992 *byteorder = bo;
2993
Walter Dörwald69652032004-09-07 20:24:22 +00002994 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002996
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002998 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 goto onError;
3000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001 Py_XDECREF(errorHandler);
3002 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 return (PyObject *)unicode;
3004
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 Py_XDECREF(errorHandler);
3008 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 return NULL;
3010}
3011
Antoine Pitrouab868312009-01-10 15:40:25 +00003012#undef FAST_CHAR_MASK
3013#undef SWAPPED_FAST_CHAR_MASK
3014
Tim Peters772747b2001-08-09 22:21:55 +00003015PyObject *
3016PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 Py_ssize_t size,
3018 const char *errors,
3019 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003021 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003022 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003023 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003024#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003025 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003026#else
3027 const int pairs = 0;
3028#endif
Tim Peters772747b2001-08-09 22:21:55 +00003029 /* Offsets from p for storing byte pairs in the right order. */
3030#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3031 int ihi = 1, ilo = 0;
3032#else
3033 int ihi = 0, ilo = 1;
3034#endif
3035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036#define STORECHAR(CH) \
3037 do { \
3038 p[ihi] = ((CH) >> 8) & 0xff; \
3039 p[ilo] = (CH) & 0xff; \
3040 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003041 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003043#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003044 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 if (s[i] >= 0x10000)
3046 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003047#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003048 /* 2 * (size + pairs + (byteorder == 0)) */
3049 if (size > PY_SSIZE_T_MAX ||
3050 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003052 nsize = size + pairs + (byteorder == 0);
3053 bytesize = nsize * 2;
3054 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003056 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 if (v == NULL)
3058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003060 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003063 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003064 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003065
3066 if (byteorder == -1) {
3067 /* force LE */
3068 ihi = 1;
3069 ilo = 0;
3070 }
3071 else if (byteorder == 1) {
3072 /* force BE */
3073 ihi = 0;
3074 ilo = 1;
3075 }
3076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003077 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 Py_UNICODE ch = *s++;
3079 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003080#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 if (ch >= 0x10000) {
3082 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3083 ch = 0xD800 | ((ch-0x10000) >> 10);
3084 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003085#endif
Tim Peters772747b2001-08-09 22:21:55 +00003086 STORECHAR(ch);
3087 if (ch2)
3088 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003089 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003090
3091 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003092 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003093#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094}
3095
3096PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3097{
3098 if (!PyUnicode_Check(unicode)) {
3099 PyErr_BadArgument();
3100 return NULL;
3101 }
3102 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 PyUnicode_GET_SIZE(unicode),
3104 NULL,
3105 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106}
3107
3108/* --- Unicode Escape Codec ----------------------------------------------- */
3109
Fredrik Lundh06d12682001-01-24 07:59:11 +00003110static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003111
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 Py_ssize_t size,
3114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t startinpos;
3118 Py_ssize_t endinpos;
3119 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003124 char* message;
3125 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 PyObject *errorHandler = NULL;
3127 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003128
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 /* Escaped strings will always be longer than the resulting
3130 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 length after conversion to the true value.
3132 (but if the error callback returns a long replacement string
3133 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 v = _PyUnicode_New(size);
3135 if (v == NULL)
3136 goto onError;
3137 if (size == 0)
3138 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003142
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 while (s < end) {
3144 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003145 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147
3148 /* Non-escape characters are interpreted as Unicode ordinals */
3149 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003150 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 continue;
3152 }
3153
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 /* \ - Escapes */
3156 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003157 c = *s++;
3158 if (s > end)
3159 c = '\0'; /* Invalid after \ */
3160 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 case '\n': break;
3164 case '\\': *p++ = '\\'; break;
3165 case '\'': *p++ = '\''; break;
3166 case '\"': *p++ = '\"'; break;
3167 case 'b': *p++ = '\b'; break;
3168 case 'f': *p++ = '\014'; break; /* FF */
3169 case 't': *p++ = '\t'; break;
3170 case 'n': *p++ = '\n'; break;
3171 case 'r': *p++ = '\r'; break;
3172 case 'v': *p++ = '\013'; break; /* VT */
3173 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3174
Benjamin Peterson29060642009-01-31 22:14:21 +00003175 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 case '0': case '1': case '2': case '3':
3177 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003178 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003179 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003180 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003181 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003182 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003184 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 break;
3186
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 /* hex escapes */
3188 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003190 digits = 2;
3191 message = "truncated \\xXX escape";
3192 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003196 digits = 4;
3197 message = "truncated \\uXXXX escape";
3198 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003201 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003202 digits = 8;
3203 message = "truncated \\UXXXXXXXX escape";
3204 hexescape:
3205 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 outpos = p-PyUnicode_AS_UNICODE(v);
3207 if (s+digits>end) {
3208 endinpos = size;
3209 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 errors, &errorHandler,
3211 "unicodeescape", "end of string in escape sequence",
3212 &starts, &end, &startinpos, &endinpos, &exc, &s,
3213 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214 goto onError;
3215 goto nextByte;
3216 }
3217 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003218 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003219 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 endinpos = (s+i+1)-starts;
3221 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 errors, &errorHandler,
3223 "unicodeescape", message,
3224 &starts, &end, &startinpos, &endinpos, &exc, &s,
3225 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003226 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003228 }
3229 chr = (chr<<4) & ~0xF;
3230 if (c >= '0' && c <= '9')
3231 chr += c - '0';
3232 else if (c >= 'a' && c <= 'f')
3233 chr += 10 + c - 'a';
3234 else
3235 chr += 10 + c - 'A';
3236 }
3237 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003238 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 /* _decoding_error will have already written into the
3240 target buffer. */
3241 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003242 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003243 /* when we get here, chr is a 32-bit unicode character */
3244 if (chr <= 0xffff)
3245 /* UCS-2 character */
3246 *p++ = (Py_UNICODE) chr;
3247 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003248 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003249 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003250#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003251 *p++ = chr;
3252#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003253 chr -= 0x10000L;
3254 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003255 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003256#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003257 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003258 endinpos = s-starts;
3259 outpos = p-PyUnicode_AS_UNICODE(v);
3260 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 errors, &errorHandler,
3262 "unicodeescape", "illegal Unicode character",
3263 &starts, &end, &startinpos, &endinpos, &exc, &s,
3264 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003265 goto onError;
3266 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003267 break;
3268
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003270 case 'N':
3271 message = "malformed \\N character escape";
3272 if (ucnhash_CAPI == NULL) {
3273 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003274 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003275 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003276 if (m == NULL)
3277 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003278 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003279 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003280 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003281 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003282 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003283 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003284 if (ucnhash_CAPI == NULL)
3285 goto ucnhashError;
3286 }
3287 if (*s == '{') {
3288 const char *start = s+1;
3289 /* look for the closing brace */
3290 while (*s != '}' && s < end)
3291 s++;
3292 if (s > start && s < end && *s == '}') {
3293 /* found a name. look it up in the unicode database */
3294 message = "unknown Unicode character name";
3295 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003296 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003297 goto store;
3298 }
3299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 endinpos = s-starts;
3301 outpos = p-PyUnicode_AS_UNICODE(v);
3302 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 errors, &errorHandler,
3304 "unicodeescape", message,
3305 &starts, &end, &startinpos, &endinpos, &exc, &s,
3306 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003307 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003308 break;
3309
3310 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003311 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 message = "\\ at end of string";
3313 s--;
3314 endinpos = s-starts;
3315 outpos = p-PyUnicode_AS_UNICODE(v);
3316 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 errors, &errorHandler,
3318 "unicodeescape", message,
3319 &starts, &end, &startinpos, &endinpos, &exc, &s,
3320 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003321 goto onError;
3322 }
3323 else {
3324 *p++ = '\\';
3325 *p++ = (unsigned char)s[-1];
3326 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003327 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003329 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003332 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003334 Py_XDECREF(errorHandler);
3335 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003337
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003339 PyErr_SetString(
3340 PyExc_UnicodeError,
3341 "\\N escapes not supported (can't load unicodedata module)"
3342 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003343 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344 Py_XDECREF(errorHandler);
3345 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003346 return NULL;
3347
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 Py_XDECREF(errorHandler);
3351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return NULL;
3353}
3354
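/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): the remark that used to follow here -- "If quotes is
   true, the string is enclosed in u"" or u'' quotes as appropriate" --
   looks stale: PyUnicode_EncodeUnicodeEscape() below takes no quotes
   argument and emits no surrounding quotes.

*/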
3361
Thomas Wouters477c8d52006-05-27 19:21:47 +00003362Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 Py_ssize_t size,
3364 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003365{
3366 /* like wcschr, but doesn't stop at NULL characters */
3367
3368 while (size-- > 0) {
3369 if (*s == ch)
3370 return s;
3371 s++;
3372 }
3373
3374 return NULL;
3375}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003376
Walter Dörwald79e913e2007-05-12 11:08:06 +00003377static const char *hexdigits = "0123456789abcdef";
3378
3379PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003382 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003385#ifdef Py_UNICODE_WIDE
3386 const Py_ssize_t expandsize = 10;
3387#else
3388 const Py_ssize_t expandsize = 6;
3389#endif
3390
Thomas Wouters89f507f2006-12-13 04:49:30 +00003391 /* XXX(nnorwitz): rather than over-allocating, it would be
3392 better to choose a different scheme. Perhaps scan the
3393 first N-chars of the string and allocate based on that size.
3394 */
3395 /* Initial allocation is based on the longest-possible unichr
3396 escape.
3397
3398 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3399 unichr, so in this case it's the longest unichr escape. In
3400 narrow (UTF-16) builds this is five chars per source unichr
3401 since there are two unichrs in the surrogate pair, so in narrow
3402 (UTF-16) builds it's not the longest unichr escape.
3403
3404 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3405 so in the narrow (UTF-16) build case it's the longest unichr
3406 escape.
3407 */
3408
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003409 if (size == 0)
3410 return PyBytes_FromStringAndSize(NULL, 0);
3411
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003412 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003414
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003415 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003416 2
3417 + expandsize*size
3418 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (repr == NULL)
3420 return NULL;
3421
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003422 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 while (size-- > 0) {
3425 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003426
Walter Dörwald79e913e2007-05-12 11:08:06 +00003427 /* Escape backslashes */
3428 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 *p++ = '\\';
3430 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003432 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003433
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003434#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003435 /* Map 21-bit characters to '\U00xxxxxx' */
3436 else if (ch >= 0x10000) {
3437 *p++ = '\\';
3438 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003439 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3440 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3441 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3442 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3443 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3444 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3445 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3446 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003447 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003448 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003449#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3451 else if (ch >= 0xD800 && ch < 0xDC00) {
3452 Py_UNICODE ch2;
3453 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003454
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 ch2 = *s++;
3456 size--;
3457 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3458 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3459 *p++ = '\\';
3460 *p++ = 'U';
3461 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3462 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3463 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3464 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3465 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3466 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3467 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3468 *p++ = hexdigits[ucs & 0x0000000F];
3469 continue;
3470 }
3471 /* Fall through: isolated surrogates are copied as-is */
3472 s--;
3473 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003474 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003475#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003476
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003478 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 *p++ = '\\';
3480 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003481 *p++ = hexdigits[(ch >> 12) & 0x000F];
3482 *p++ = hexdigits[(ch >> 8) & 0x000F];
3483 *p++ = hexdigits[(ch >> 4) & 0x000F];
3484 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003486
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003487 /* Map special whitespace to '\t', \n', '\r' */
3488 else if (ch == '\t') {
3489 *p++ = '\\';
3490 *p++ = 't';
3491 }
3492 else if (ch == '\n') {
3493 *p++ = '\\';
3494 *p++ = 'n';
3495 }
3496 else if (ch == '\r') {
3497 *p++ = '\\';
3498 *p++ = 'r';
3499 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003500
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003501 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003502 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003504 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003505 *p++ = hexdigits[(ch >> 4) & 0x000F];
3506 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003507 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003508
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 /* Copy everything else as-is */
3510 else
3511 *p++ = (char) ch;
3512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003514 assert(p - PyBytes_AS_STRING(repr) > 0);
3515 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3516 return NULL;
3517 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518}
3519
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003520PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003522 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 return NULL;
3526 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003527 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3528 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003529 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530}
3531
3532/* --- Raw Unicode Escape Codec ------------------------------------------- */
3533
3534PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 Py_ssize_t size,
3536 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003539 Py_ssize_t startinpos;
3540 Py_ssize_t endinpos;
3541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 const char *end;
3545 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 PyObject *errorHandler = NULL;
3547 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 /* Escaped strings will always be longer than the resulting
3550 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 length after conversion to the true value. (But decoding error
3552 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 v = _PyUnicode_New(size);
3554 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 end = s + size;
3560 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 unsigned char c;
3562 Py_UCS4 x;
3563 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003564 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 /* Non-escape characters are interpreted as Unicode ordinals */
3567 if (*s != '\\') {
3568 *p++ = (unsigned char)*s++;
3569 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 startinpos = s-starts;
3572
3573 /* \u-escapes are only interpreted iff the number of leading
3574 backslashes if odd */
3575 bs = s;
3576 for (;s < end;) {
3577 if (*s != '\\')
3578 break;
3579 *p++ = (unsigned char)*s++;
3580 }
3581 if (((s - bs) & 1) == 0 ||
3582 s >= end ||
3583 (*s != 'u' && *s != 'U')) {
3584 continue;
3585 }
3586 p--;
3587 count = *s=='u' ? 4 : 8;
3588 s++;
3589
3590 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3591 outpos = p-PyUnicode_AS_UNICODE(v);
3592 for (x = 0, i = 0; i < count; ++i, ++s) {
3593 c = (unsigned char)*s;
3594 if (!ISXDIGIT(c)) {
3595 endinpos = s-starts;
3596 if (unicode_decode_call_errorhandler(
3597 errors, &errorHandler,
3598 "rawunicodeescape", "truncated \\uXXXX",
3599 &starts, &end, &startinpos, &endinpos, &exc, &s,
3600 &v, &outpos, &p))
3601 goto onError;
3602 goto nextByte;
3603 }
3604 x = (x<<4) & ~0xF;
3605 if (c >= '0' && c <= '9')
3606 x += c - '0';
3607 else if (c >= 'a' && c <= 'f')
3608 x += 10 + c - 'a';
3609 else
3610 x += 10 + c - 'A';
3611 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003612 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 /* UCS-2 character */
3614 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003615 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 /* UCS-4 character. Either store directly, or as
3617 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003618#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003619 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003620#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 x -= 0x10000L;
3622 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3623 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003624#endif
3625 } else {
3626 endinpos = s-starts;
3627 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003628 if (unicode_decode_call_errorhandler(
3629 errors, &errorHandler,
3630 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003631 &starts, &end, &startinpos, &endinpos, &exc, &s,
3632 &v, &outpos, &p))
3633 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 nextByte:
3636 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003638 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 Py_XDECREF(errorHandler);
3641 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003643
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 Py_XDECREF(errorHandler);
3647 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 return NULL;
3649}
3650
3651PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003654 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 char *p;
3656 char *q;
3657
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003658#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003659 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003660#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003661 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003662#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003663
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003664 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003666
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003667 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 if (repr == NULL)
3669 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003670 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003671 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003673 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 while (size-- > 0) {
3675 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003676#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 /* Map 32-bit characters to '\Uxxxxxxxx' */
3678 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003679 *p++ = '\\';
3680 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003681 *p++ = hexdigits[(ch >> 28) & 0xf];
3682 *p++ = hexdigits[(ch >> 24) & 0xf];
3683 *p++ = hexdigits[(ch >> 20) & 0xf];
3684 *p++ = hexdigits[(ch >> 16) & 0xf];
3685 *p++ = hexdigits[(ch >> 12) & 0xf];
3686 *p++ = hexdigits[(ch >> 8) & 0xf];
3687 *p++ = hexdigits[(ch >> 4) & 0xf];
3688 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003689 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003690 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003691#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3693 if (ch >= 0xD800 && ch < 0xDC00) {
3694 Py_UNICODE ch2;
3695 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003696
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 ch2 = *s++;
3698 size--;
3699 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3700 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3701 *p++ = '\\';
3702 *p++ = 'U';
3703 *p++ = hexdigits[(ucs >> 28) & 0xf];
3704 *p++ = hexdigits[(ucs >> 24) & 0xf];
3705 *p++ = hexdigits[(ucs >> 20) & 0xf];
3706 *p++ = hexdigits[(ucs >> 16) & 0xf];
3707 *p++ = hexdigits[(ucs >> 12) & 0xf];
3708 *p++ = hexdigits[(ucs >> 8) & 0xf];
3709 *p++ = hexdigits[(ucs >> 4) & 0xf];
3710 *p++ = hexdigits[ucs & 0xf];
3711 continue;
3712 }
3713 /* Fall through: isolated surrogates are copied as-is */
3714 s--;
3715 size++;
3716 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003717#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 /* Map 16-bit characters to '\uxxxx' */
3719 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 *p++ = '\\';
3721 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003722 *p++ = hexdigits[(ch >> 12) & 0xf];
3723 *p++ = hexdigits[(ch >> 8) & 0xf];
3724 *p++ = hexdigits[(ch >> 4) & 0xf];
3725 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 /* Copy everything else as-is */
3728 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 *p++ = (char) ch;
3730 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003731 size = p - q;
3732
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003733 assert(size > 0);
3734 if (_PyBytes_Resize(&repr, size) < 0)
3735 return NULL;
3736 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737}
3738
3739PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3740{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003741 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003743 PyErr_BadArgument();
3744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003746 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3747 PyUnicode_GET_SIZE(unicode));
3748
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003749 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750}
3751
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003752/* --- Unicode Internal Codec ------------------------------------------- */
3753
3754PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 Py_ssize_t size,
3756 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003757{
3758 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003759 Py_ssize_t startinpos;
3760 Py_ssize_t endinpos;
3761 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003762 PyUnicodeObject *v;
3763 Py_UNICODE *p;
3764 const char *end;
3765 const char *reason;
3766 PyObject *errorHandler = NULL;
3767 PyObject *exc = NULL;
3768
Neal Norwitzd43069c2006-01-08 01:12:10 +00003769#ifdef Py_UNICODE_WIDE
3770 Py_UNICODE unimax = PyUnicode_GetMax();
3771#endif
3772
Thomas Wouters89f507f2006-12-13 04:49:30 +00003773 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003774 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3775 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003777 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003779 p = PyUnicode_AS_UNICODE(v);
3780 end = s + size;
3781
3782 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003783 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003784 /* We have to sanity check the raw data, otherwise doom looms for
3785 some malformed UCS-4 data. */
3786 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003787#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003788 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003789#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003790 end-s < Py_UNICODE_SIZE
3791 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003792 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003793 startinpos = s - starts;
3794 if (end-s < Py_UNICODE_SIZE) {
3795 endinpos = end-starts;
3796 reason = "truncated input";
3797 }
3798 else {
3799 endinpos = s - starts + Py_UNICODE_SIZE;
3800 reason = "illegal code point (> 0x10FFFF)";
3801 }
3802 outpos = p - PyUnicode_AS_UNICODE(v);
3803 if (unicode_decode_call_errorhandler(
3804 errors, &errorHandler,
3805 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003806 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003807 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003808 goto onError;
3809 }
3810 }
3811 else {
3812 p++;
3813 s += Py_UNICODE_SIZE;
3814 }
3815 }
3816
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003817 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003818 goto onError;
3819 Py_XDECREF(errorHandler);
3820 Py_XDECREF(exc);
3821 return (PyObject *)v;
3822
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003824 Py_XDECREF(v);
3825 Py_XDECREF(errorHandler);
3826 Py_XDECREF(exc);
3827 return NULL;
3828}
3829
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830/* --- Latin-1 Codec ------------------------------------------------------ */
3831
3832PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 Py_ssize_t size,
3834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835{
3836 PyUnicodeObject *v;
3837 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003838 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003839
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003841 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 Py_UNICODE r = *(unsigned char*)s;
3843 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003844 }
3845
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 v = _PyUnicode_New(size);
3847 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003852 e = s + size;
3853 /* Unrolling the copy makes it much faster by reducing the looping
3854 overhead. This is similar to what many memcpy() implementations do. */
3855 unrolled_end = e - 4;
3856 while (s < unrolled_end) {
3857 p[0] = (unsigned char) s[0];
3858 p[1] = (unsigned char) s[1];
3859 p[2] = (unsigned char) s[2];
3860 p[3] = (unsigned char) s[3];
3861 s += 4;
3862 p += 4;
3863 }
3864 while (s < e)
3865 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003867
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 Py_XDECREF(v);
3870 return NULL;
3871}
3872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873/* create or adjust a UnicodeEncodeError */
3874static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003875 const char *encoding,
3876 const Py_UNICODE *unicode, Py_ssize_t size,
3877 Py_ssize_t startpos, Py_ssize_t endpos,
3878 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 *exceptionObject = PyUnicodeEncodeError_Create(
3882 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
3884 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003885 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3886 goto onError;
3887 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3888 goto onError;
3889 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3890 goto onError;
3891 return;
3892 onError:
3893 Py_DECREF(*exceptionObject);
3894 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 }
3896}
3897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898/* raises a UnicodeEncodeError */
3899static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003900 const char *encoding,
3901 const Py_UNICODE *unicode, Py_ssize_t size,
3902 Py_ssize_t startpos, Py_ssize_t endpos,
3903 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904{
3905 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909}
3910
3911/* error handling callback helper:
3912 build arguments, call the callback and check the arguments,
3913 put the result into newpos and return the replacement string, which
3914 has to be freed by the caller */
3915static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 PyObject **errorHandler,
3917 const char *encoding, const char *reason,
3918 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3919 Py_ssize_t startpos, Py_ssize_t endpos,
3920 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003922 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923
3924 PyObject *restuple;
3925 PyObject *resunicode;
3926
3927 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 }
3932
3933 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937
3938 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003943 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 Py_DECREF(restuple);
3945 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003947 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 &resunicode, newpos)) {
3949 Py_DECREF(restuple);
3950 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003952 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
3953 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3954 Py_DECREF(restuple);
3955 return NULL;
3956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003958 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003959 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3961 Py_DECREF(restuple);
3962 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003963 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 Py_INCREF(resunicode);
3965 Py_DECREF(restuple);
3966 return resunicode;
3967}
3968
3969static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 Py_ssize_t size,
3971 const char *errors,
3972 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973{
3974 /* output object */
3975 PyObject *res;
3976 /* pointers to the beginning and end+1 of input */
3977 const Py_UNICODE *startp = p;
3978 const Py_UNICODE *endp = p + size;
3979 /* pointer to the beginning of the unencodable characters */
3980 /* const Py_UNICODE *badp = NULL; */
3981 /* pointer into the output */
3982 char *str;
3983 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003985 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3986 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 PyObject *errorHandler = NULL;
3988 PyObject *exc = NULL;
3989 /* the following variable is used for caching string comparisons
3990 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3991 int known_errorHandler = -1;
3992
3993 /* allocate enough for a simple encoding without
3994 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003995 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003996 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003997 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003999 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004000 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 ressize = size;
4002
4003 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 /* can we encode this? */
4007 if (c<limit) {
4008 /* no overflow check, because we know that the space is enough */
4009 *str++ = (char)c;
4010 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004012 else {
4013 Py_ssize_t unicodepos = p-startp;
4014 Py_ssize_t requiredsize;
4015 PyObject *repunicode;
4016 Py_ssize_t repsize;
4017 Py_ssize_t newpos;
4018 Py_ssize_t respos;
4019 Py_UNICODE *uni2;
4020 /* startpos for collecting unencodable chars */
4021 const Py_UNICODE *collstart = p;
4022 const Py_UNICODE *collend = p;
4023 /* find all unecodable characters */
4024 while ((collend < endp) && ((*collend)>=limit))
4025 ++collend;
4026 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4027 if (known_errorHandler==-1) {
4028 if ((errors==NULL) || (!strcmp(errors, "strict")))
4029 known_errorHandler = 1;
4030 else if (!strcmp(errors, "replace"))
4031 known_errorHandler = 2;
4032 else if (!strcmp(errors, "ignore"))
4033 known_errorHandler = 3;
4034 else if (!strcmp(errors, "xmlcharrefreplace"))
4035 known_errorHandler = 4;
4036 else
4037 known_errorHandler = 0;
4038 }
4039 switch (known_errorHandler) {
4040 case 1: /* strict */
4041 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4042 goto onError;
4043 case 2: /* replace */
4044 while (collstart++<collend)
4045 *str++ = '?'; /* fall through */
4046 case 3: /* ignore */
4047 p = collend;
4048 break;
4049 case 4: /* xmlcharrefreplace */
4050 respos = str - PyBytes_AS_STRING(res);
4051 /* determine replacement size (temporarily (mis)uses p) */
4052 for (p = collstart, repsize = 0; p < collend; ++p) {
4053 if (*p<10)
4054 repsize += 2+1+1;
4055 else if (*p<100)
4056 repsize += 2+2+1;
4057 else if (*p<1000)
4058 repsize += 2+3+1;
4059 else if (*p<10000)
4060 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004061#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 else
4063 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004064#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 else if (*p<100000)
4066 repsize += 2+5+1;
4067 else if (*p<1000000)
4068 repsize += 2+6+1;
4069 else
4070 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004071#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004072 }
4073 requiredsize = respos+repsize+(endp-collend);
4074 if (requiredsize > ressize) {
4075 if (requiredsize<2*ressize)
4076 requiredsize = 2*ressize;
4077 if (_PyBytes_Resize(&res, requiredsize))
4078 goto onError;
4079 str = PyBytes_AS_STRING(res) + respos;
4080 ressize = requiredsize;
4081 }
4082 /* generate replacement (temporarily (mis)uses p) */
4083 for (p = collstart; p < collend; ++p) {
4084 str += sprintf(str, "&#%d;", (int)*p);
4085 }
4086 p = collend;
4087 break;
4088 default:
4089 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4090 encoding, reason, startp, size, &exc,
4091 collstart-startp, collend-startp, &newpos);
4092 if (repunicode == NULL)
4093 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004094 if (!PyUnicode_Check(repunicode)) {
4095 /* Implementation limitation: byte results not supported yet. */
4096 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
4097 Py_DECREF(repunicode);
4098 goto onError;
4099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 /* need more space? (at least enough for what we
4101 have+the replacement+the rest of the string, so
4102 we won't have to check space for encodable characters) */
4103 respos = str - PyBytes_AS_STRING(res);
4104 repsize = PyUnicode_GET_SIZE(repunicode);
4105 requiredsize = respos+repsize+(endp-collend);
4106 if (requiredsize > ressize) {
4107 if (requiredsize<2*ressize)
4108 requiredsize = 2*ressize;
4109 if (_PyBytes_Resize(&res, requiredsize)) {
4110 Py_DECREF(repunicode);
4111 goto onError;
4112 }
4113 str = PyBytes_AS_STRING(res) + respos;
4114 ressize = requiredsize;
4115 }
4116 /* check if there is anything unencodable in the replacement
4117 and copy it to the output */
4118 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4119 c = *uni2;
4120 if (c >= limit) {
4121 raise_encode_exception(&exc, encoding, startp, size,
4122 unicodepos, unicodepos+1, reason);
4123 Py_DECREF(repunicode);
4124 goto onError;
4125 }
4126 *str = (char)c;
4127 }
4128 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004129 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004131 }
4132 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004133 /* Resize if we allocated to much */
4134 size = str - PyBytes_AS_STRING(res);
4135 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004136 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004137 if (_PyBytes_Resize(&res, size) < 0)
4138 goto onError;
4139 }
4140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 Py_XDECREF(errorHandler);
4142 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004143 return res;
4144
4145 onError:
4146 Py_XDECREF(res);
4147 Py_XDECREF(errorHandler);
4148 Py_XDECREF(exc);
4149 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150}
4151
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 Py_ssize_t size,
4154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157}
4158
4159PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4160{
4161 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 PyErr_BadArgument();
4163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 }
4165 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 PyUnicode_GET_SIZE(unicode),
4167 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168}
4169
4170/* --- 7-bit ASCII Codec -------------------------------------------------- */
4171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 Py_ssize_t size,
4174 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 PyUnicodeObject *v;
4178 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004179 Py_ssize_t startinpos;
4180 Py_ssize_t endinpos;
4181 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 const char *e;
4183 PyObject *errorHandler = NULL;
4184 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004185
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004187 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 Py_UNICODE r = *(unsigned char*)s;
4189 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004190 }
Tim Petersced69f82003-09-16 20:30:58 +00004191
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 v = _PyUnicode_New(size);
4193 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 e = s + size;
4199 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 register unsigned char c = (unsigned char)*s;
4201 if (c < 128) {
4202 *p++ = c;
4203 ++s;
4204 }
4205 else {
4206 startinpos = s-starts;
4207 endinpos = startinpos + 1;
4208 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4209 if (unicode_decode_call_errorhandler(
4210 errors, &errorHandler,
4211 "ascii", "ordinal not in range(128)",
4212 &starts, &e, &startinpos, &endinpos, &exc, &s,
4213 &v, &outpos, &p))
4214 goto onError;
4215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004217 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4219 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 Py_XDECREF(errorHandler);
4221 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004223
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 Py_XDECREF(errorHandler);
4227 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 return NULL;
4229}
4230
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 Py_ssize_t size,
4233 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236}
4237
4238PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4239{
4240 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 PyErr_BadArgument();
4242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 }
4244 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 PyUnicode_GET_SIZE(unicode),
4246 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247}
4248
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004249#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004250
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004251/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004252
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004253#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004254#define NEED_RETRY
4255#endif
4256
4257/* XXX This code is limited to "true" double-byte encodings, as
4258 a) it assumes an incomplete character consists of a single byte, and
4259 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004261
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  CharPrev() is then used to
   locate the character preceding it, and the byte is accepted when there
   is no preceding character (CharPrev returned the same position), when
   that character does not begin with a lead byte, or when it is exactly
   two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4273/*
4274 * Decode MBCS string into unicode object. If 'final' is set, converts
4275 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4276 */
4277static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 const char *s, /* MBCS string */
4279 int size, /* sizeof MBCS string */
4280 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004281{
4282 Py_UNICODE *p;
4283 Py_ssize_t n = 0;
4284 int usize = 0;
4285
4286 assert(size >= 0);
4287
4288 /* Skip trailing lead-byte unless 'final' is set */
4289 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004291
4292 /* First get the size of the result */
4293 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4295 if (usize == 0) {
4296 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4297 return -1;
4298 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004299 }
4300
4301 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 /* Create unicode object */
4303 *v = _PyUnicode_New(usize);
4304 if (*v == NULL)
4305 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004306 }
4307 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 /* Extend unicode object */
4309 n = PyUnicode_GET_SIZE(*v);
4310 if (_PyUnicode_Resize(v, n + usize) < 0)
4311 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004312 }
4313
4314 /* Do the conversion */
4315 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 p = PyUnicode_AS_UNICODE(*v) + n;
4317 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4318 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4319 return -1;
4320 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004321 }
4322
4323 return size;
4324}
4325
4326PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 Py_ssize_t size,
4328 const char *errors,
4329 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004330{
4331 PyUnicodeObject *v = NULL;
4332 int done;
4333
4334 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004336
4337#ifdef NEED_RETRY
4338 retry:
4339 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004341 else
4342#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004344
4345 if (done < 0) {
4346 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004348 }
4349
4350 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004352
4353#ifdef NEED_RETRY
4354 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 s += done;
4356 size -= done;
4357 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004358 }
4359#endif
4360
4361 return (PyObject *)v;
4362}
4363
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004364PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 Py_ssize_t size,
4366 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004368 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4369}
4370
4371/*
4372 * Convert unicode into string object (MBCS).
4373 * Returns 0 if succeed, -1 otherwise.
4374 */
4375static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 const Py_UNICODE *p, /* unicode */
4377 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004378{
4379 int mbcssize = 0;
4380 Py_ssize_t n = 0;
4381
4382 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004383
4384 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004385 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4387 if (mbcssize == 0) {
4388 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4389 return -1;
4390 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004391 }
4392
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004393 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 /* Create string object */
4395 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4396 if (*repr == NULL)
4397 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004398 }
4399 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 /* Extend string object */
4401 n = PyBytes_Size(*repr);
4402 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4403 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004404 }
4405
4406 /* Do the conversion */
4407 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 char *s = PyBytes_AS_STRING(*repr) + n;
4409 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4410 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4411 return -1;
4412 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004413 }
4414
4415 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004416}
4417
4418PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_ssize_t size,
4420 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004421{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004422 PyObject *repr = NULL;
4423 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004424
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004425#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004427 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004429 else
4430#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004432
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004433 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 Py_XDECREF(repr);
4435 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004436 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004437
4438#ifdef NEED_RETRY
4439 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 p += INT_MAX;
4441 size -= INT_MAX;
4442 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004443 }
4444#endif
4445
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004446 return repr;
4447}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004448
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004449PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4450{
4451 if (!PyUnicode_Check(unicode)) {
4452 PyErr_BadArgument();
4453 return NULL;
4454 }
4455 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 PyUnicode_GET_SIZE(unicode),
4457 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004458}
4459
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004460#undef NEED_RETRY
4461
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004462#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004463
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464/* --- Character Mapping Codec -------------------------------------------- */
4465
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 Py_ssize_t size,
4468 PyObject *mapping,
4469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 Py_ssize_t startinpos;
4473 Py_ssize_t endinpos;
4474 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 PyUnicodeObject *v;
4477 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004478 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 PyObject *errorHandler = NULL;
4480 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004481 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004482 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 /* Default to Latin-1 */
4485 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487
4488 v = _PyUnicode_New(size);
4489 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004495 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 mapstring = PyUnicode_AS_UNICODE(mapping);
4497 maplen = PyUnicode_GET_SIZE(mapping);
4498 while (s < e) {
4499 unsigned char ch = *s;
4500 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 if (ch < maplen)
4503 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 if (x == 0xfffe) {
4506 /* undefined mapping */
4507 outpos = p-PyUnicode_AS_UNICODE(v);
4508 startinpos = s-starts;
4509 endinpos = startinpos+1;
4510 if (unicode_decode_call_errorhandler(
4511 errors, &errorHandler,
4512 "charmap", "character maps to <undefined>",
4513 &starts, &e, &startinpos, &endinpos, &exc, &s,
4514 &v, &outpos, &p)) {
4515 goto onError;
4516 }
4517 continue;
4518 }
4519 *p++ = x;
4520 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004521 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004522 }
4523 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 while (s < e) {
4525 unsigned char ch = *s;
4526 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004527
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4529 w = PyLong_FromLong((long)ch);
4530 if (w == NULL)
4531 goto onError;
4532 x = PyObject_GetItem(mapping, w);
4533 Py_DECREF(w);
4534 if (x == NULL) {
4535 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4536 /* No mapping found means: mapping is undefined. */
4537 PyErr_Clear();
4538 x = Py_None;
4539 Py_INCREF(x);
4540 } else
4541 goto onError;
4542 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004543
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 /* Apply mapping */
4545 if (PyLong_Check(x)) {
4546 long value = PyLong_AS_LONG(x);
4547 if (value < 0 || value > 65535) {
4548 PyErr_SetString(PyExc_TypeError,
4549 "character mapping must be in range(65536)");
4550 Py_DECREF(x);
4551 goto onError;
4552 }
4553 *p++ = (Py_UNICODE)value;
4554 }
4555 else if (x == Py_None) {
4556 /* undefined mapping */
4557 outpos = p-PyUnicode_AS_UNICODE(v);
4558 startinpos = s-starts;
4559 endinpos = startinpos+1;
4560 if (unicode_decode_call_errorhandler(
4561 errors, &errorHandler,
4562 "charmap", "character maps to <undefined>",
4563 &starts, &e, &startinpos, &endinpos, &exc, &s,
4564 &v, &outpos, &p)) {
4565 Py_DECREF(x);
4566 goto onError;
4567 }
4568 Py_DECREF(x);
4569 continue;
4570 }
4571 else if (PyUnicode_Check(x)) {
4572 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004573
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 if (targetsize == 1)
4575 /* 1-1 mapping */
4576 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004577
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 else if (targetsize > 1) {
4579 /* 1-n mapping */
4580 if (targetsize > extrachars) {
4581 /* resize first */
4582 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4583 Py_ssize_t needed = (targetsize - extrachars) + \
4584 (targetsize << 2);
4585 extrachars += needed;
4586 /* XXX overflow detection missing */
4587 if (_PyUnicode_Resize(&v,
4588 PyUnicode_GET_SIZE(v) + needed) < 0) {
4589 Py_DECREF(x);
4590 goto onError;
4591 }
4592 p = PyUnicode_AS_UNICODE(v) + oldpos;
4593 }
4594 Py_UNICODE_COPY(p,
4595 PyUnicode_AS_UNICODE(x),
4596 targetsize);
4597 p += targetsize;
4598 extrachars -= targetsize;
4599 }
4600 /* 1-0 mapping: skip the character */
4601 }
4602 else {
4603 /* wrong return value */
4604 PyErr_SetString(PyExc_TypeError,
4605 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004606 Py_DECREF(x);
4607 goto onError;
4608 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 Py_DECREF(x);
4610 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 }
4613 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 Py_XDECREF(errorHandler);
4617 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004619
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 Py_XDECREF(errorHandler);
4622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 Py_XDECREF(v);
4624 return NULL;
4625}
4626
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004627/* Charmap encoding: the lookup table */
4628
4629struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 PyObject_HEAD
4631 unsigned char level1[32];
4632 int count2, count3;
4633 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004634};
4635
4636static PyObject*
4637encoding_map_size(PyObject *obj, PyObject* args)
4638{
4639 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004640 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004642}
4643
4644static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004645 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 PyDoc_STR("Return the size (in bytes) of this object") },
4647 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004648};
4649
4650static void
4651encoding_map_dealloc(PyObject* o)
4652{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004653 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004654}
4655
4656static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004657 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 "EncodingMap", /*tp_name*/
4659 sizeof(struct encoding_map), /*tp_basicsize*/
4660 0, /*tp_itemsize*/
4661 /* methods */
4662 encoding_map_dealloc, /*tp_dealloc*/
4663 0, /*tp_print*/
4664 0, /*tp_getattr*/
4665 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004666 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 0, /*tp_repr*/
4668 0, /*tp_as_number*/
4669 0, /*tp_as_sequence*/
4670 0, /*tp_as_mapping*/
4671 0, /*tp_hash*/
4672 0, /*tp_call*/
4673 0, /*tp_str*/
4674 0, /*tp_getattro*/
4675 0, /*tp_setattro*/
4676 0, /*tp_as_buffer*/
4677 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4678 0, /*tp_doc*/
4679 0, /*tp_traverse*/
4680 0, /*tp_clear*/
4681 0, /*tp_richcompare*/
4682 0, /*tp_weaklistoffset*/
4683 0, /*tp_iter*/
4684 0, /*tp_iternext*/
4685 encoding_map_methods, /*tp_methods*/
4686 0, /*tp_members*/
4687 0, /*tp_getset*/
4688 0, /*tp_base*/
4689 0, /*tp_dict*/
4690 0, /*tp_descr_get*/
4691 0, /*tp_descr_set*/
4692 0, /*tp_dictoffset*/
4693 0, /*tp_init*/
4694 0, /*tp_alloc*/
4695 0, /*tp_new*/
4696 0, /*tp_free*/
4697 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004698};
4699
4700PyObject*
4701PyUnicode_BuildEncodingMap(PyObject* string)
4702{
4703 Py_UNICODE *decode;
4704 PyObject *result;
4705 struct encoding_map *mresult;
4706 int i;
4707 int need_dict = 0;
4708 unsigned char level1[32];
4709 unsigned char level2[512];
4710 unsigned char *mlevel1, *mlevel2, *mlevel3;
4711 int count2 = 0, count3 = 0;
4712
4713 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4714 PyErr_BadArgument();
4715 return NULL;
4716 }
4717 decode = PyUnicode_AS_UNICODE(string);
4718 memset(level1, 0xFF, sizeof level1);
4719 memset(level2, 0xFF, sizeof level2);
4720
4721 /* If there isn't a one-to-one mapping of NULL to \0,
4722 or if there are non-BMP characters, we need to use
4723 a mapping dictionary. */
4724 if (decode[0] != 0)
4725 need_dict = 1;
4726 for (i = 1; i < 256; i++) {
4727 int l1, l2;
4728 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004729#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004730 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004731#endif
4732 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004733 need_dict = 1;
4734 break;
4735 }
4736 if (decode[i] == 0xFFFE)
4737 /* unmapped character */
4738 continue;
4739 l1 = decode[i] >> 11;
4740 l2 = decode[i] >> 7;
4741 if (level1[l1] == 0xFF)
4742 level1[l1] = count2++;
4743 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004744 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004745 }
4746
4747 if (count2 >= 0xFF || count3 >= 0xFF)
4748 need_dict = 1;
4749
4750 if (need_dict) {
4751 PyObject *result = PyDict_New();
4752 PyObject *key, *value;
4753 if (!result)
4754 return NULL;
4755 for (i = 0; i < 256; i++) {
4756 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004757 key = PyLong_FromLong(decode[i]);
4758 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004759 if (!key || !value)
4760 goto failed1;
4761 if (PyDict_SetItem(result, key, value) == -1)
4762 goto failed1;
4763 Py_DECREF(key);
4764 Py_DECREF(value);
4765 }
4766 return result;
4767 failed1:
4768 Py_XDECREF(key);
4769 Py_XDECREF(value);
4770 Py_DECREF(result);
4771 return NULL;
4772 }
4773
4774 /* Create a three-level trie */
4775 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4776 16*count2 + 128*count3 - 1);
4777 if (!result)
4778 return PyErr_NoMemory();
4779 PyObject_Init(result, &EncodingMapType);
4780 mresult = (struct encoding_map*)result;
4781 mresult->count2 = count2;
4782 mresult->count3 = count3;
4783 mlevel1 = mresult->level1;
4784 mlevel2 = mresult->level23;
4785 mlevel3 = mresult->level23 + 16*count2;
4786 memcpy(mlevel1, level1, 32);
4787 memset(mlevel2, 0xFF, 16*count2);
4788 memset(mlevel3, 0, 128*count3);
4789 count3 = 0;
4790 for (i = 1; i < 256; i++) {
4791 int o1, o2, o3, i2, i3;
4792 if (decode[i] == 0xFFFE)
4793 /* unmapped character */
4794 continue;
4795 o1 = decode[i]>>11;
4796 o2 = (decode[i]>>7) & 0xF;
4797 i2 = 16*mlevel1[o1] + o2;
4798 if (mlevel2[i2] == 0xFF)
4799 mlevel2[i2] = count3++;
4800 o3 = decode[i] & 0x7F;
4801 i3 = 128*mlevel2[i2] + o3;
4802 mlevel3[i3] = i;
4803 }
4804 return result;
4805}
4806
4807static int
4808encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4809{
4810 struct encoding_map *map = (struct encoding_map*)mapping;
4811 int l1 = c>>11;
4812 int l2 = (c>>7) & 0xF;
4813 int l3 = c & 0x7F;
4814 int i;
4815
4816#ifdef Py_UNICODE_WIDE
4817 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004819 }
4820#endif
4821 if (c == 0)
4822 return 0;
4823 /* level 1*/
4824 i = map->level1[l1];
4825 if (i == 0xFF) {
4826 return -1;
4827 }
4828 /* level 2*/
4829 i = map->level23[16*i+l2];
4830 if (i == 0xFF) {
4831 return -1;
4832 }
4833 /* level 3 */
4834 i = map->level23[16*map->count2 + 128*i + l3];
4835 if (i == 0) {
4836 return -1;
4837 }
4838 return i;
4839}
4840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841/* Lookup the character ch in the mapping. If the character
4842 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004843 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Christian Heimes217cfd12007-12-02 14:31:20 +00004846 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 PyObject *x;
4848
4849 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 x = PyObject_GetItem(mapping, w);
4852 Py_DECREF(w);
4853 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4855 /* No mapping found means: mapping is undefined. */
4856 PyErr_Clear();
4857 x = Py_None;
4858 Py_INCREF(x);
4859 return x;
4860 } else
4861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004863 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004865 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 long value = PyLong_AS_LONG(x);
4867 if (value < 0 || value > 255) {
4868 PyErr_SetString(PyExc_TypeError,
4869 "character mapping must be in range(256)");
4870 Py_DECREF(x);
4871 return NULL;
4872 }
4873 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004875 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 /* wrong return value */
4879 PyErr_Format(PyExc_TypeError,
4880 "character mapping must return integer, bytes or None, not %.400s",
4881 x->ob_type->tp_name);
4882 Py_DECREF(x);
4883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 }
4885}
4886
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004887static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004888charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004889{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004890 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4891 /* exponentially overallocate to minimize reallocations */
4892 if (requiredsize < 2*outsize)
4893 requiredsize = 2*outsize;
4894 if (_PyBytes_Resize(outobj, requiredsize))
4895 return -1;
4896 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004897}
4898
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004903 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 space is available. Return a new reference to the object that
4905 was put in the output buffer, or Py_None, if the mapping was undefined
4906 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004907 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004909charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004912 PyObject *rep;
4913 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004914 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915
Christian Heimes90aa7642007-12-19 02:45:37 +00004916 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004917 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004919 if (res == -1)
4920 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 if (outsize<requiredsize)
4922 if (charmapencode_resize(outobj, outpos, requiredsize))
4923 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004924 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 outstart[(*outpos)++] = (char)res;
4926 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004927 }
4928
4929 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004932 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 Py_DECREF(rep);
4934 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004935 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 if (PyLong_Check(rep)) {
4937 Py_ssize_t requiredsize = *outpos+1;
4938 if (outsize<requiredsize)
4939 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4940 Py_DECREF(rep);
4941 return enc_EXCEPTION;
4942 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004943 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004945 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 else {
4947 const char *repchars = PyBytes_AS_STRING(rep);
4948 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4949 Py_ssize_t requiredsize = *outpos+repsize;
4950 if (outsize<requiredsize)
4951 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4952 Py_DECREF(rep);
4953 return enc_EXCEPTION;
4954 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004955 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 memcpy(outstart + *outpos, repchars, repsize);
4957 *outpos += repsize;
4958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004960 Py_DECREF(rep);
4961 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962}
4963
4964/* handle an error in PyUnicode_EncodeCharmap
4965 Return 0 on success, -1 on error */
4966static
4967int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004970 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004971 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972{
4973 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t repsize;
4975 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 Py_UNICODE *uni2;
4977 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004978 Py_ssize_t collstartpos = *inpos;
4979 Py_ssize_t collendpos = *inpos+1;
4980 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 char *encoding = "charmap";
4982 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004983 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 /* find all unencodable characters */
4986 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004987 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004988 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 int res = encoding_map_lookup(p[collendpos], mapping);
4990 if (res != -1)
4991 break;
4992 ++collendpos;
4993 continue;
4994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004995
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 rep = charmapencode_lookup(p[collendpos], mapping);
4997 if (rep==NULL)
4998 return -1;
4999 else if (rep!=Py_None) {
5000 Py_DECREF(rep);
5001 break;
5002 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005003 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 }
5006 /* cache callback name lookup
5007 * (if not done yet, i.e. it's the first error) */
5008 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if ((errors==NULL) || (!strcmp(errors, "strict")))
5010 *known_errorHandler = 1;
5011 else if (!strcmp(errors, "replace"))
5012 *known_errorHandler = 2;
5013 else if (!strcmp(errors, "ignore"))
5014 *known_errorHandler = 3;
5015 else if (!strcmp(errors, "xmlcharrefreplace"))
5016 *known_errorHandler = 4;
5017 else
5018 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 }
5020 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005021 case 1: /* strict */
5022 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5023 return -1;
5024 case 2: /* replace */
5025 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 x = charmapencode_output('?', mapping, res, respos);
5027 if (x==enc_EXCEPTION) {
5028 return -1;
5029 }
5030 else if (x==enc_FAILED) {
5031 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5032 return -1;
5033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005034 }
5035 /* fall through */
5036 case 3: /* ignore */
5037 *inpos = collendpos;
5038 break;
5039 case 4: /* xmlcharrefreplace */
5040 /* generate replacement (temporarily (mis)uses p) */
5041 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 char buffer[2+29+1+1];
5043 char *cp;
5044 sprintf(buffer, "&#%d;", (int)p[collpos]);
5045 for (cp = buffer; *cp; ++cp) {
5046 x = charmapencode_output(*cp, mapping, res, respos);
5047 if (x==enc_EXCEPTION)
5048 return -1;
5049 else if (x==enc_FAILED) {
5050 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5051 return -1;
5052 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005053 }
5054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005055 *inpos = collendpos;
5056 break;
5057 default:
5058 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 encoding, reason, p, size, exceptionObject,
5060 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005061 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 return -1;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005063 if (!PyUnicode_Check(repunicode)) {
5064 /* Implementation limitation: byte results not supported yet. */
5065 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5066 Py_DECREF(repunicode);
5067 return -1;
5068 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005069 /* generate replacement */
5070 repsize = PyUnicode_GET_SIZE(repunicode);
5071 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 x = charmapencode_output(*uni2, mapping, res, respos);
5073 if (x==enc_EXCEPTION) {
5074 return -1;
5075 }
5076 else if (x==enc_FAILED) {
5077 Py_DECREF(repunicode);
5078 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5079 return -1;
5080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005081 }
5082 *inpos = newpos;
5083 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 }
5085 return 0;
5086}
5087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 Py_ssize_t size,
5090 PyObject *mapping,
5091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 /* output object */
5094 PyObject *res = NULL;
5095 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005096 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005098 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 PyObject *errorHandler = NULL;
5100 PyObject *exc = NULL;
5101 /* the following variable is used for caching string comparisons
5102 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5103 * 3=ignore, 4=xmlcharrefreplace */
5104 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105
5106 /* Default to Latin-1 */
5107 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 /* allocate enough for a simple encoding without
5111 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005112 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 if (res == NULL)
5114 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005115 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 /* try to encode it */
5120 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5121 if (x==enc_EXCEPTION) /* error */
5122 goto onError;
5123 if (x==enc_FAILED) { /* unencodable character */
5124 if (charmap_encoding_error(p, size, &inpos, mapping,
5125 &exc,
5126 &known_errorHandler, &errorHandler, errors,
5127 &res, &respos)) {
5128 goto onError;
5129 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 else
5132 /* done with this character => adjust input position */
5133 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005137 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005138 if (_PyBytes_Resize(&res, respos) < 0)
5139 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 Py_XDECREF(exc);
5142 Py_XDECREF(errorHandler);
5143 return res;
5144
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 Py_XDECREF(res);
5147 Py_XDECREF(exc);
5148 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 return NULL;
5150}
5151
5152PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154{
5155 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 PyErr_BadArgument();
5157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 }
5159 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 PyUnicode_GET_SIZE(unicode),
5161 mapping,
5162 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163}
5164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165/* create or adjust a UnicodeTranslateError */
5166static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 const Py_UNICODE *unicode, Py_ssize_t size,
5168 Py_ssize_t startpos, Py_ssize_t endpos,
5169 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005172 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 }
5175 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5177 goto onError;
5178 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5179 goto onError;
5180 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5181 goto onError;
5182 return;
5183 onError:
5184 Py_DECREF(*exceptionObject);
5185 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 }
5187}
5188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189/* raises a UnicodeTranslateError */
5190static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 const Py_UNICODE *unicode, Py_ssize_t size,
5192 Py_ssize_t startpos, Py_ssize_t endpos,
5193 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194{
5195 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005198 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199}
5200
5201/* error handling callback helper:
5202 build arguments, call the callback and check the arguments,
5203 put the result into newpos and return the replacement string, which
5204 has to be freed by the caller */
5205static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 PyObject **errorHandler,
5207 const char *reason,
5208 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5209 Py_ssize_t startpos, Py_ssize_t endpos,
5210 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005212 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005214 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 PyObject *restuple;
5216 PyObject *resunicode;
5217
5218 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 }
5223
5224 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228
5229 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005234 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 Py_DECREF(restuple);
5236 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 }
5238 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 &resunicode, &i_newpos)) {
5240 Py_DECREF(restuple);
5241 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 else
5246 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005247 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5249 Py_DECREF(restuple);
5250 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 Py_INCREF(resunicode);
5253 Py_DECREF(restuple);
5254 return resunicode;
5255}
5256
5257/* Lookup the character ch in the mapping and put the result in result,
5258 which must be decrefed by the caller.
5259 Return 0 on success, -1 on error */
5260static
5261int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5262{
Christian Heimes217cfd12007-12-02 14:31:20 +00005263 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 PyObject *x;
5265
5266 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 x = PyObject_GetItem(mapping, w);
5269 Py_DECREF(w);
5270 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5272 /* No mapping found means: use 1:1 mapping. */
5273 PyErr_Clear();
5274 *result = NULL;
5275 return 0;
5276 } else
5277 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005278 }
5279 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 *result = x;
5281 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005283 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 long value = PyLong_AS_LONG(x);
5285 long max = PyUnicode_GetMax();
5286 if (value < 0 || value > max) {
5287 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005288 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 Py_DECREF(x);
5290 return -1;
5291 }
5292 *result = x;
5293 return 0;
5294 }
5295 else if (PyUnicode_Check(x)) {
5296 *result = x;
5297 return 0;
5298 }
5299 else {
5300 /* wrong return value */
5301 PyErr_SetString(PyExc_TypeError,
5302 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005303 Py_DECREF(x);
5304 return -1;
5305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005306}
5307/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 if not reallocate and adjust various state variables.
5309 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310static
Walter Dörwald4894c302003-10-24 14:25:28 +00005311int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005314 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005315 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 /* remember old output position */
5317 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5318 /* exponentially overallocate to minimize reallocations */
5319 if (requiredsize < 2 * oldsize)
5320 requiredsize = 2 * oldsize;
5321 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5322 return -1;
5323 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 }
5325 return 0;
5326}
5327/* lookup the character, put the result in the output string and adjust
5328 various state variables. Return a new reference to the object that
5329 was put in the output buffer in *result, or Py_None, if the mapping was
5330 undefined (in which case no character was written).
5331 The called must decref result.
5332 Return 0 on success, -1 on error. */
5333static
Walter Dörwald4894c302003-10-24 14:25:28 +00005334int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5336 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337{
Walter Dörwald4894c302003-10-24 14:25:28 +00005338 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 /* not found => default to 1:1 mapping */
5342 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005343 }
5344 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005346 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 /* no overflow check, because we know that the space is enough */
5348 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 }
5350 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5352 if (repsize==1) {
5353 /* no overflow check, because we know that the space is enough */
5354 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5355 }
5356 else if (repsize!=0) {
5357 /* more than one character */
5358 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5359 (insize - (curinp-startinp)) +
5360 repsize - 1;
5361 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5362 return -1;
5363 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5364 *outp += repsize;
5365 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 }
5367 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 return 0;
5370}
5371
5372PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 Py_ssize_t size,
5374 PyObject *mapping,
5375 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 /* output object */
5378 PyObject *res = NULL;
5379 /* pointers to the beginning and end+1 of input */
5380 const Py_UNICODE *startp = p;
5381 const Py_UNICODE *endp = p + size;
5382 /* pointer into the output */
5383 Py_UNICODE *str;
5384 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 char *reason = "character maps to <undefined>";
5387 PyObject *errorHandler = NULL;
5388 PyObject *exc = NULL;
5389 /* the following variable is used for caching string comparisons
5390 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5391 * 3=ignore, 4=xmlcharrefreplace */
5392 int known_errorHandler = -1;
5393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 PyErr_BadArgument();
5396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398
5399 /* allocate enough for a simple 1:1 translation without
5400 replacements, if we need more, we'll resize */
5401 res = PyUnicode_FromUnicode(NULL, size);
5402 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 /* try to encode it */
5410 PyObject *x = NULL;
5411 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5412 Py_XDECREF(x);
5413 goto onError;
5414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 if (x!=Py_None) /* it worked => adjust input pointer */
5417 ++p;
5418 else { /* untranslatable character */
5419 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5420 Py_ssize_t repsize;
5421 Py_ssize_t newpos;
5422 Py_UNICODE *uni2;
5423 /* startpos for collecting untranslatable chars */
5424 const Py_UNICODE *collstart = p;
5425 const Py_UNICODE *collend = p+1;
5426 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 /* find all untranslatable characters */
5429 while (collend < endp) {
5430 if (charmaptranslate_lookup(*collend, mapping, &x))
5431 goto onError;
5432 Py_XDECREF(x);
5433 if (x!=Py_None)
5434 break;
5435 ++collend;
5436 }
5437 /* cache callback name lookup
5438 * (if not done yet, i.e. it's the first error) */
5439 if (known_errorHandler==-1) {
5440 if ((errors==NULL) || (!strcmp(errors, "strict")))
5441 known_errorHandler = 1;
5442 else if (!strcmp(errors, "replace"))
5443 known_errorHandler = 2;
5444 else if (!strcmp(errors, "ignore"))
5445 known_errorHandler = 3;
5446 else if (!strcmp(errors, "xmlcharrefreplace"))
5447 known_errorHandler = 4;
5448 else
5449 known_errorHandler = 0;
5450 }
5451 switch (known_errorHandler) {
5452 case 1: /* strict */
5453 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005454 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 case 2: /* replace */
5456 /* No need to check for space, this is a 1:1 replacement */
5457 for (coll = collstart; coll<collend; ++coll)
5458 *str++ = '?';
5459 /* fall through */
5460 case 3: /* ignore */
5461 p = collend;
5462 break;
5463 case 4: /* xmlcharrefreplace */
5464 /* generate replacement (temporarily (mis)uses p) */
5465 for (p = collstart; p < collend; ++p) {
5466 char buffer[2+29+1+1];
5467 char *cp;
5468 sprintf(buffer, "&#%d;", (int)*p);
5469 if (charmaptranslate_makespace(&res, &str,
5470 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5471 goto onError;
5472 for (cp = buffer; *cp; ++cp)
5473 *str++ = *cp;
5474 }
5475 p = collend;
5476 break;
5477 default:
5478 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5479 reason, startp, size, &exc,
5480 collstart-startp, collend-startp, &newpos);
5481 if (repunicode == NULL)
5482 goto onError;
5483 /* generate replacement */
5484 repsize = PyUnicode_GET_SIZE(repunicode);
5485 if (charmaptranslate_makespace(&res, &str,
5486 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5487 Py_DECREF(repunicode);
5488 goto onError;
5489 }
5490 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5491 *str++ = *uni2;
5492 p = startp + newpos;
5493 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005495 }
5496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 /* Resize if we allocated to much */
5498 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005499 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 if (PyUnicode_Resize(&res, respos) < 0)
5501 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 }
5503 Py_XDECREF(exc);
5504 Py_XDECREF(errorHandler);
5505 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 Py_XDECREF(res);
5509 Py_XDECREF(exc);
5510 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 return NULL;
5512}
5513
5514PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 PyObject *mapping,
5516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517{
5518 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 str = PyUnicode_FromObject(str);
5521 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 PyUnicode_GET_SIZE(str),
5525 mapping,
5526 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 Py_DECREF(str);
5528 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005529
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 Py_XDECREF(str);
5532 return NULL;
5533}
Tim Petersced69f82003-09-16 20:30:58 +00005534
Guido van Rossum9e896b32000-04-05 20:11:21 +00005535/* --- Decimal Encoder ---------------------------------------------------- */
5536
5537int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 Py_ssize_t length,
5539 char *output,
5540 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005541{
5542 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 PyObject *errorHandler = NULL;
5544 PyObject *exc = NULL;
5545 const char *encoding = "decimal";
5546 const char *reason = "invalid decimal Unicode string";
5547 /* the following variable is used for caching string comparisons
5548 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5549 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005550
5551 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 PyErr_BadArgument();
5553 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005554 }
5555
5556 p = s;
5557 end = s + length;
5558 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 register Py_UNICODE ch = *p;
5560 int decimal;
5561 PyObject *repunicode;
5562 Py_ssize_t repsize;
5563 Py_ssize_t newpos;
5564 Py_UNICODE *uni2;
5565 Py_UNICODE *collstart;
5566 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005567
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005569 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 ++p;
5571 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 decimal = Py_UNICODE_TODECIMAL(ch);
5574 if (decimal >= 0) {
5575 *output++ = '0' + decimal;
5576 ++p;
5577 continue;
5578 }
5579 if (0 < ch && ch < 256) {
5580 *output++ = (char)ch;
5581 ++p;
5582 continue;
5583 }
5584 /* All other characters are considered unencodable */
5585 collstart = p;
5586 collend = p+1;
5587 while (collend < end) {
5588 if ((0 < *collend && *collend < 256) ||
5589 !Py_UNICODE_ISSPACE(*collend) ||
5590 Py_UNICODE_TODECIMAL(*collend))
5591 break;
5592 }
5593 /* cache callback name lookup
5594 * (if not done yet, i.e. it's the first error) */
5595 if (known_errorHandler==-1) {
5596 if ((errors==NULL) || (!strcmp(errors, "strict")))
5597 known_errorHandler = 1;
5598 else if (!strcmp(errors, "replace"))
5599 known_errorHandler = 2;
5600 else if (!strcmp(errors, "ignore"))
5601 known_errorHandler = 3;
5602 else if (!strcmp(errors, "xmlcharrefreplace"))
5603 known_errorHandler = 4;
5604 else
5605 known_errorHandler = 0;
5606 }
5607 switch (known_errorHandler) {
5608 case 1: /* strict */
5609 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5610 goto onError;
5611 case 2: /* replace */
5612 for (p = collstart; p < collend; ++p)
5613 *output++ = '?';
5614 /* fall through */
5615 case 3: /* ignore */
5616 p = collend;
5617 break;
5618 case 4: /* xmlcharrefreplace */
5619 /* generate replacement (temporarily (mis)uses p) */
5620 for (p = collstart; p < collend; ++p)
5621 output += sprintf(output, "&#%d;", (int)*p);
5622 p = collend;
5623 break;
5624 default:
5625 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5626 encoding, reason, s, length, &exc,
5627 collstart-s, collend-s, &newpos);
5628 if (repunicode == NULL)
5629 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005630 if (!PyUnicode_Check(repunicode)) {
5631 /* Implementation limitation: byte results not supported yet. */
5632 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5633 Py_DECREF(repunicode);
5634 goto onError;
5635 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 /* generate replacement */
5637 repsize = PyUnicode_GET_SIZE(repunicode);
5638 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5639 Py_UNICODE ch = *uni2;
5640 if (Py_UNICODE_ISSPACE(ch))
5641 *output++ = ' ';
5642 else {
5643 decimal = Py_UNICODE_TODECIMAL(ch);
5644 if (decimal >= 0)
5645 *output++ = '0' + decimal;
5646 else if (0 < ch && ch < 256)
5647 *output++ = (char)ch;
5648 else {
5649 Py_DECREF(repunicode);
5650 raise_encode_exception(&exc, encoding,
5651 s, length, collstart-s, collend-s, reason);
5652 goto onError;
5653 }
5654 }
5655 }
5656 p = s + newpos;
5657 Py_DECREF(repunicode);
5658 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005659 }
5660 /* 0-terminate the output string */
5661 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 Py_XDECREF(exc);
5663 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005664 return 0;
5665
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 Py_XDECREF(exc);
5668 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005669 return -1;
5670}
5671
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672/* --- Helpers ------------------------------------------------------------ */
5673
Eric Smith8c663262007-08-25 02:26:07 +00005674#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005675#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005676#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005677/* Include _ParseTupleFinds from find.h */
5678#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005679#include "stringlib/find.h"
5680#include "stringlib/partition.h"
5681
Eric Smith5807c412008-05-11 21:00:57 +00005682#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005683#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005684#include "stringlib/localeutil.h"
5685
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope,
   using (obj)->length as the sequence length: a negative value is
   shifted by the length and then floored at 0, and `end` is capped at
   the length before its sign is examined.  `start` is not capped at
   the length.  Expands to several statements, so it must be used as a
   statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5698
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005700 PyObject *substr,
5701 Py_ssize_t start,
5702 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005704 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005705 PyUnicodeObject* str_obj;
5706 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005707
Thomas Wouters477c8d52006-05-27 19:21:47 +00005708 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5709 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005711 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5712 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 Py_DECREF(str_obj);
5714 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 }
Tim Petersced69f82003-09-16 20:30:58 +00005716
Thomas Wouters477c8d52006-05-27 19:21:47 +00005717 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005718
Thomas Wouters477c8d52006-05-27 19:21:47 +00005719 result = stringlib_count(
5720 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5721 );
5722
5723 Py_DECREF(sub_obj);
5724 Py_DECREF(str_obj);
5725
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 return result;
5727}
5728
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005730 PyObject *sub,
5731 Py_ssize_t start,
5732 Py_ssize_t end,
5733 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005738 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005740 sub = PyUnicode_FromObject(sub);
5741 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 Py_DECREF(str);
5743 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 }
Tim Petersced69f82003-09-16 20:30:58 +00005745
Thomas Wouters477c8d52006-05-27 19:21:47 +00005746 if (direction > 0)
5747 result = stringlib_find_slice(
5748 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5749 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5750 start, end
5751 );
5752 else
5753 result = stringlib_rfind_slice(
5754 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5755 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5756 start, end
5757 );
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005760 Py_DECREF(sub);
5761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 return result;
5763}
5764
Tim Petersced69f82003-09-16 20:30:58 +00005765static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 PyUnicodeObject *substring,
5768 Py_ssize_t start,
5769 Py_ssize_t end,
5770 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 if (substring->length == 0)
5773 return 1;
5774
Thomas Wouters477c8d52006-05-27 19:21:47 +00005775 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
5777 end -= substring->length;
5778 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780
5781 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 if (Py_UNICODE_MATCH(self, end, substring))
5783 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 } else {
5785 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 }
5788
5789 return 0;
5790}
5791
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 PyObject *substr,
5794 Py_ssize_t start,
5795 Py_ssize_t end,
5796 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005799
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 str = PyUnicode_FromObject(str);
5801 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 substr = PyUnicode_FromObject(substr);
5804 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 Py_DECREF(str);
5806 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 }
Tim Petersced69f82003-09-16 20:30:58 +00005808
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 (PyUnicodeObject *)substr,
5811 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 Py_DECREF(str);
5813 Py_DECREF(substr);
5814 return result;
5815}
5816
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817/* Apply fixfct filter to the Unicode object self and return a
5818 reference to the modified object */
5819
Tim Petersced69f82003-09-16 20:30:58 +00005820static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
5824
5825 PyUnicodeObject *u;
5826
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005827 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005830
5831 Py_UNICODE_COPY(u->str, self->str, self->length);
5832
Tim Peters7a29bd52001-09-12 03:03:31 +00005833 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 /* fixfct should return TRUE if it modified the buffer. If
5835 FALSE, return a reference to the original buffer instead
5836 (to save space, not time) */
5837 Py_INCREF(self);
5838 Py_DECREF(u);
5839 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
5841 return (PyObject*) u;
5842}
5843
Tim Petersced69f82003-09-16 20:30:58 +00005844static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845int fixupper(PyUnicodeObject *self)
5846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 Py_UNICODE *s = self->str;
5849 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005853
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 ch = Py_UNICODE_TOUPPER(*s);
5855 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 *s = ch;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 s++;
5860 }
5861
5862 return status;
5863}
5864
Tim Petersced69f82003-09-16 20:30:58 +00005865static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866int fixlower(PyUnicodeObject *self)
5867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 Py_UNICODE *s = self->str;
5870 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005874
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 ch = Py_UNICODE_TOLOWER(*s);
5876 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 *s = ch;
5879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 s++;
5881 }
5882
5883 return status;
5884}
5885
Tim Petersced69f82003-09-16 20:30:58 +00005886static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887int fixswapcase(PyUnicodeObject *self)
5888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005889 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 Py_UNICODE *s = self->str;
5891 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 while (len-- > 0) {
5894 if (Py_UNICODE_ISUPPER(*s)) {
5895 *s = Py_UNICODE_TOLOWER(*s);
5896 status = 1;
5897 } else if (Py_UNICODE_ISLOWER(*s)) {
5898 *s = Py_UNICODE_TOUPPER(*s);
5899 status = 1;
5900 }
5901 s++;
5902 }
5903
5904 return status;
5905}
5906
Tim Petersced69f82003-09-16 20:30:58 +00005907static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908int fixcapitalize(PyUnicodeObject *self)
5909{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005911 Py_UNICODE *s = self->str;
5912 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005913
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005914 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005916 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 *s = Py_UNICODE_TOUPPER(*s);
5918 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005920 s++;
5921 while (--len > 0) {
5922 if (Py_UNICODE_ISUPPER(*s)) {
5923 *s = Py_UNICODE_TOLOWER(*s);
5924 status = 1;
5925 }
5926 s++;
5927 }
5928 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
5931static
5932int fixtitle(PyUnicodeObject *self)
5933{
5934 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5935 register Py_UNICODE *e;
5936 int previous_is_cased;
5937
5938 /* Shortcut for single character strings */
5939 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5941 if (*p != ch) {
5942 *p = ch;
5943 return 1;
5944 }
5945 else
5946 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 }
Tim Petersced69f82003-09-16 20:30:58 +00005948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 e = p + PyUnicode_GET_SIZE(self);
5950 previous_is_cased = 0;
5951 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005953
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 if (previous_is_cased)
5955 *p = Py_UNICODE_TOLOWER(ch);
5956 else
5957 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005958
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 if (Py_UNICODE_ISLOWER(ch) ||
5960 Py_UNICODE_ISUPPER(ch) ||
5961 Py_UNICODE_ISTITLE(ch))
5962 previous_is_cased = 1;
5963 else
5964 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 }
5966 return 1;
5967}
5968
Tim Peters8ce9f162004-08-27 01:49:32 +00005969PyObject *
5970PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971{
Skip Montanaro6543b452004-09-16 03:28:13 +00005972 const Py_UNICODE blank = ' ';
5973 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005975 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005976 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5977 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005978 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5979 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005980 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005981 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Tim Peters05eba1f2004-08-27 21:32:02 +00005983 fseq = PySequence_Fast(seq, "");
5984 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005985 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005986 }
5987
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005988 /* NOTE: the following code can't call back into Python code,
5989 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005990 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005991
Tim Peters05eba1f2004-08-27 21:32:02 +00005992 seqlen = PySequence_Fast_GET_SIZE(fseq);
5993 /* If empty sequence, return u"". */
5994 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005995 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5996 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005997 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005998 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005999 /* If singleton sequence with an exact Unicode, return that. */
6000 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 item = items[0];
6002 if (PyUnicode_CheckExact(item)) {
6003 Py_INCREF(item);
6004 res = (PyUnicodeObject *)item;
6005 goto Done;
6006 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006007 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006008 else {
6009 /* Set up sep and seplen */
6010 if (separator == NULL) {
6011 sep = &blank;
6012 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006013 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006014 else {
6015 if (!PyUnicode_Check(separator)) {
6016 PyErr_Format(PyExc_TypeError,
6017 "separator: expected str instance,"
6018 " %.80s found",
6019 Py_TYPE(separator)->tp_name);
6020 goto onError;
6021 }
6022 sep = PyUnicode_AS_UNICODE(separator);
6023 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006024 }
6025 }
6026
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006027 /* There are at least two things to join, or else we have a subclass
6028 * of str in the sequence.
6029 * Do a pre-pass to figure out the total amount of space we'll
6030 * need (sz), and see whether all argument are strings.
6031 */
6032 sz = 0;
6033 for (i = 0; i < seqlen; i++) {
6034 const Py_ssize_t old_sz = sz;
6035 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 if (!PyUnicode_Check(item)) {
6037 PyErr_Format(PyExc_TypeError,
6038 "sequence item %zd: expected str instance,"
6039 " %.80s found",
6040 i, Py_TYPE(item)->tp_name);
6041 goto onError;
6042 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006043 sz += PyUnicode_GET_SIZE(item);
6044 if (i != 0)
6045 sz += seplen;
6046 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6047 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006049 goto onError;
6050 }
6051 }
Tim Petersced69f82003-09-16 20:30:58 +00006052
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006053 res = _PyUnicode_New(sz);
6054 if (res == NULL)
6055 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006056
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006057 /* Catenate everything. */
6058 res_p = PyUnicode_AS_UNICODE(res);
6059 for (i = 0; i < seqlen; ++i) {
6060 Py_ssize_t itemlen;
6061 item = items[i];
6062 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* Copy item, and maybe the separator. */
6064 if (i) {
6065 Py_UNICODE_COPY(res_p, sep, seplen);
6066 res_p += seplen;
6067 }
6068 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6069 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006070 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006071
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006073 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 return (PyObject *)res;
6075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006077 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006078 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return NULL;
6080}
6081
Tim Petersced69f82003-09-16 20:30:58 +00006082static
6083PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 Py_ssize_t left,
6085 Py_ssize_t right,
6086 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087{
6088 PyUnicodeObject *u;
6089
6090 if (left < 0)
6091 left = 0;
6092 if (right < 0)
6093 right = 0;
6094
Tim Peters7a29bd52001-09-12 03:03:31 +00006095 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 Py_INCREF(self);
6097 return self;
6098 }
6099
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006100 if (left > PY_SSIZE_T_MAX - self->length ||
6101 right > PY_SSIZE_T_MAX - (left + self->length)) {
6102 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6103 return NULL;
6104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 u = _PyUnicode_New(left + self->length + right);
6106 if (u) {
6107 if (left)
6108 Py_UNICODE_FILL(u->str, fill, left);
6109 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6110 if (right)
6111 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6112 }
6113
6114 return u;
6115}
6116
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro relies on three names from the enclosing function: a
   PyObject* variable `str` (scratch), the result `list`, and a label
   `onError` that is jumped to when either the allocation or the append
   fails.  The scratch reference is released in both the success and
   the failure case.

   The body is wrapped in do { } while (0) so that the expansion is
   exactly one statement and must be terminated by the caller's own
   semicolon; it can therefore be used safely as the body of an
   unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
6128static
6129PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 PyObject *list,
6131 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006133 register Py_ssize_t i;
6134 register Py_ssize_t j;
6135 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006137 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
6139 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006141 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006143 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6145 i++;
6146 if (j < i) {
6147 if (maxcount-- <= 0)
6148 break;
6149 SPLIT_APPEND(buf, j, i);
6150 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6151 i++;
6152 j = i;
6153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
6155 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 }
6158 return list;
6159
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 Py_DECREF(list);
6162 return NULL;
6163}
6164
6165PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 register Py_ssize_t i;
6169 register Py_ssize_t j;
6170 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 PyObject *list;
6172 PyObject *str;
6173 Py_UNICODE *data;
6174
6175 string = PyUnicode_FromObject(string);
6176 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 data = PyUnicode_AS_UNICODE(string);
6179 len = PyUnicode_GET_SIZE(string);
6180
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 list = PyList_New(0);
6182 if (!list)
6183 goto onError;
6184
6185 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006187
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 /* Find a line and append it */
6189 while (i < len && !BLOOM_LINEBREAK(data[i]))
6190 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006193 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 if (i < len) {
6195 if (data[i] == '\r' && i + 1 < len &&
6196 data[i+1] == '\n')
6197 i += 2;
6198 else
6199 i++;
6200 if (keepends)
6201 eol = i;
6202 }
6203 SPLIT_APPEND(data, j, eol);
6204 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
6206 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 }
6209
6210 Py_DECREF(string);
6211 return list;
6212
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006214 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 Py_DECREF(string);
6216 return NULL;
6217}
6218
Tim Petersced69f82003-09-16 20:30:58 +00006219static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 PyObject *list,
6222 Py_UNICODE ch,
6223 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006225 register Py_ssize_t i;
6226 register Py_ssize_t j;
6227 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006229 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230
6231 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 if (buf[i] == ch) {
6233 if (maxcount-- <= 0)
6234 break;
6235 SPLIT_APPEND(buf, j, i);
6236 i = j = i + 1;
6237 } else
6238 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
6240 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 }
6243 return list;
6244
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 Py_DECREF(list);
6247 return NULL;
6248}
6249
Tim Petersced69f82003-09-16 20:30:58 +00006250static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 PyObject *list,
6253 PyUnicodeObject *substring,
6254 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006256 register Py_ssize_t i;
6257 register Py_ssize_t j;
6258 Py_ssize_t len = self->length;
6259 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 PyObject *str;
6261
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006262 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 if (Py_UNICODE_MATCH(self, i, substring)) {
6264 if (maxcount-- <= 0)
6265 break;
6266 SPLIT_APPEND(self->str, j, i);
6267 i = j = i + sublen;
6268 } else
6269 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
6271 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 }
6274 return list;
6275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 Py_DECREF(list);
6278 return NULL;
6279}
6280
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006281static
6282PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 PyObject *list,
6284 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006286 register Py_ssize_t i;
6287 register Py_ssize_t j;
6288 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006289 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006290 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006291
6292 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006294 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006296 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6298 i--;
6299 if (j > i) {
6300 if (maxcount-- <= 0)
6301 break;
6302 SPLIT_APPEND(buf, i + 1, j + 1);
6303 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6304 i--;
6305 j = i;
6306 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006307 }
6308 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006310 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006311 if (PyList_Reverse(list) < 0)
6312 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006313 return list;
6314
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006316 Py_DECREF(list);
6317 return NULL;
6318}
6319
Benjamin Peterson14339b62009-01-31 16:36:08 +00006320static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006321PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 PyObject *list,
6323 Py_UNICODE ch,
6324 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006326 register Py_ssize_t i;
6327 register Py_ssize_t j;
6328 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006329 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006330 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006331
6332 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 if (buf[i] == ch) {
6334 if (maxcount-- <= 0)
6335 break;
6336 SPLIT_APPEND(buf, i + 1, j + 1);
6337 j = i = i - 1;
6338 } else
6339 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006340 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006341 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006343 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006344 if (PyList_Reverse(list) < 0)
6345 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006346 return list;
6347
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006349 Py_DECREF(list);
6350 return NULL;
6351}
6352
Benjamin Peterson14339b62009-01-31 16:36:08 +00006353static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006354PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 PyObject *list,
6356 PyUnicodeObject *substring,
6357 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 register Py_ssize_t i;
6360 register Py_ssize_t j;
6361 Py_ssize_t len = self->length;
6362 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006363 PyObject *str;
6364
6365 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 if (Py_UNICODE_MATCH(self, i, substring)) {
6367 if (maxcount-- <= 0)
6368 break;
6369 SPLIT_APPEND(self->str, i + sublen, j);
6370 j = i;
6371 i -= sublen;
6372 } else
6373 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006374 }
6375 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 if (PyList_Reverse(list) < 0)
6379 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006380 return list;
6381
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006383 Py_DECREF(list);
6384 return NULL;
6385}
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387#undef SPLIT_APPEND
6388
6389static
6390PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 PyUnicodeObject *substring,
6392 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 PyObject *list;
6395
6396 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006397 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
6399 list = PyList_New(0);
6400 if (!list)
6401 return NULL;
6402
6403 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 Py_DECREF(list);
6411 PyErr_SetString(PyExc_ValueError, "empty separator");
6412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
6414 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
Tim Petersced69f82003-09-16 20:30:58 +00006418static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006419PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 PyUnicodeObject *substring,
6421 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422{
6423 PyObject *list;
6424
6425 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006426 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006427
6428 list = PyList_New(0);
6429 if (!list)
6430 return NULL;
6431
6432 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006434
6435 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006437
6438 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 Py_DECREF(list);
6440 PyErr_SetString(PyExc_ValueError, "empty separator");
6441 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006442 }
6443 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006445}
6446
6447static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 PyUnicodeObject *str1,
6450 PyUnicodeObject *str2,
6451 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452{
6453 PyUnicodeObject *u;
6454
6455 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457
Thomas Wouters477c8d52006-05-27 19:21:47 +00006458 if (str1->length == str2->length) {
6459 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006460 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 if (str1->length == 1) {
6462 /* replace characters */
6463 Py_UNICODE u1, u2;
6464 if (!findchar(self->str, self->length, str1->str[0]))
6465 goto nothing;
6466 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6467 if (!u)
6468 return NULL;
6469 Py_UNICODE_COPY(u->str, self->str, self->length);
6470 u1 = str1->str[0];
6471 u2 = str2->str[0];
6472 for (i = 0; i < u->length; i++)
6473 if (u->str[i] == u1) {
6474 if (--maxcount < 0)
6475 break;
6476 u->str[i] = u2;
6477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479 i = fastsearch(
6480 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482 if (i < 0)
6483 goto nothing;
6484 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6485 if (!u)
6486 return NULL;
6487 Py_UNICODE_COPY(u->str, self->str, self->length);
6488 while (i <= self->length - str1->length)
6489 if (Py_UNICODE_MATCH(self, i, str1)) {
6490 if (--maxcount < 0)
6491 break;
6492 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6493 i += str1->length;
6494 } else
6495 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006498
6499 Py_ssize_t n, i, j, e;
6500 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 Py_UNICODE *p;
6502
6503 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006504 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 if (n > maxcount)
6506 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 if (n == 0)
6508 goto nothing;
6509 /* new_size = self->length + n * (str2->length - str1->length)); */
6510 delta = (str2->length - str1->length);
6511 if (delta == 0) {
6512 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514 product = n * (str2->length - str1->length);
6515 if ((product / (str2->length - str1->length)) != n) {
6516 PyErr_SetString(PyExc_OverflowError,
6517 "replace string is too long");
6518 return NULL;
6519 }
6520 new_size = self->length + product;
6521 if (new_size < 0) {
6522 PyErr_SetString(PyExc_OverflowError,
6523 "replace string is too long");
6524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 }
6526 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527 u = _PyUnicode_New(new_size);
6528 if (!u)
6529 return NULL;
6530 i = 0;
6531 p = u->str;
6532 e = self->length - str1->length;
6533 if (str1->length > 0) {
6534 while (n-- > 0) {
6535 /* look for next match */
6536 j = i;
6537 while (j <= e) {
6538 if (Py_UNICODE_MATCH(self, j, str1))
6539 break;
6540 j++;
6541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 if (j > e)
6544 break;
6545 /* copy unchanged part [i:j] */
6546 Py_UNICODE_COPY(p, self->str+i, j-i);
6547 p += j - i;
6548 }
6549 /* copy substitution string */
6550 if (str2->length > 0) {
6551 Py_UNICODE_COPY(p, str2->str, str2->length);
6552 p += str2->length;
6553 }
6554 i = j + str1->length;
6555 }
6556 if (i < self->length)
6557 /* copy tail [i:] */
6558 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6559 } else {
6560 /* interleave */
6561 while (n > 0) {
6562 Py_UNICODE_COPY(p, str2->str, str2->length);
6563 p += str2->length;
6564 if (--n <= 0)
6565 break;
6566 *p++ = self->str[i++];
6567 }
6568 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006574 /* nothing to replace; return original string (when possible) */
6575 if (PyUnicode_CheckExact(self)) {
6576 Py_INCREF(self);
6577 return (PyObject *) self;
6578 }
6579 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
6582/* --- Unicode Object Methods --------------------------------------------- */
6583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006584PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586\n\
6587Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006591unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 return fixup(self, fixtitle);
6594}
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598\n\
6599Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return fixup(self, fixcapitalize);
6606}
6607
#if 0
/* Compiled out: kept for reference only. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word with fixcapitalize
   and join the words again with the default separator of
   PyUnicode_Join().  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* release the old entry before its slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6645
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006646/* Argument converter. Coerces to a single unicode character */
6647
6648static int
6649convert_uc(PyObject *obj, void *addr)
6650{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006651 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6652 PyObject *uniobj;
6653 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006654
Benjamin Peterson14339b62009-01-31 16:36:08 +00006655 uniobj = PyUnicode_FromObject(obj);
6656 if (uniobj == NULL) {
6657 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006659 return 0;
6660 }
6661 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6662 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006664 Py_DECREF(uniobj);
6665 return 0;
6666 }
6667 unistr = PyUnicode_AS_UNICODE(uniobj);
6668 *fillcharloc = unistr[0];
6669 Py_DECREF(uniobj);
6670 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006671}
6672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006676Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006677done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
6679static PyObject *
6680unicode_center(PyUnicodeObject *self, PyObject *args)
6681{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006682 Py_ssize_t marg, left;
6683 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006684 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
Thomas Woutersde017742006-02-16 19:34:37 +00006686 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 return NULL;
6688
Tim Peters7a29bd52001-09-12 03:03:31 +00006689 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 Py_INCREF(self);
6691 return (PyObject*) self;
6692 }
6693
6694 marg = width - self->length;
6695 left = marg / 2 + (marg & width & 1);
6696
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006697 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698}
6699
Marc-André Lemburge5034372000-08-08 08:04:29 +00006700#if 0
6701
6702/* This code should go into some future Unicode collation support
6703 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006704 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006705
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006706/* speedy UTF-16 code point order comparison */
6707/* gleaned from: */
6708/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6709
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006710static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006711{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006712 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006713 0, 0, 0, 0, 0, 0, 0, 0,
6714 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006715 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006716};
6717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718static int
6719unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006721 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006722
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 Py_UNICODE *s1 = str1->str;
6724 Py_UNICODE *s2 = str2->str;
6725
6726 len1 = str1->length;
6727 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006730 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006731
6732 c1 = *s1++;
6733 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006734
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 if (c1 > (1<<11) * 26)
6736 c1 += utf16Fixup[c1>>11];
6737 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006738 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006739 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006740
6741 if (c1 != c2)
6742 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006743
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006744 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 }
6746
6747 return (len1 < len2) ? -1 : (len1 != len2);
6748}
6749
Marc-André Lemburge5034372000-08-08 08:04:29 +00006750#else
6751
6752static int
6753unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6754{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006756
6757 Py_UNICODE *s1 = str1->str;
6758 Py_UNICODE *s2 = str2->str;
6759
6760 len1 = str1->length;
6761 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006762
Marc-André Lemburge5034372000-08-08 08:04:29 +00006763 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006764 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006765
Fredrik Lundh45714e92001-06-26 16:39:36 +00006766 c1 = *s1++;
6767 c2 = *s2++;
6768
6769 if (c1 != c2)
6770 return (c1 < c2) ? -1 : 1;
6771
Marc-André Lemburge5034372000-08-08 08:04:29 +00006772 len1--; len2--;
6773 }
6774
6775 return (len1 < len2) ? -1 : (len1 != len2);
6776}
6777
6778#endif
6779
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006783 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6784 return unicode_compare((PyUnicodeObject *)left,
6785 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006786 PyErr_Format(PyExc_TypeError,
6787 "Can't compare %.100s and %.100s",
6788 left->ob_type->tp_name,
6789 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 return -1;
6791}
6792
Martin v. Löwis5b222132007-06-10 09:51:05 +00006793int
6794PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6795{
6796 int i;
6797 Py_UNICODE *id;
6798 assert(PyUnicode_Check(uni));
6799 id = PyUnicode_AS_UNICODE(uni);
6800 /* Compare Unicode string and source character set string */
6801 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 if (id[i] != str[i])
6803 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006804 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006806 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006808 return 0;
6809}
6810
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006811
Benjamin Peterson29060642009-01-31 22:14:21 +00006812#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006813 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006814
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006815PyObject *PyUnicode_RichCompare(PyObject *left,
6816 PyObject *right,
6817 int op)
6818{
6819 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006820
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006821 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6822 PyObject *v;
6823 if (((PyUnicodeObject *) left)->length !=
6824 ((PyUnicodeObject *) right)->length) {
6825 if (op == Py_EQ) {
6826 Py_INCREF(Py_False);
6827 return Py_False;
6828 }
6829 if (op == Py_NE) {
6830 Py_INCREF(Py_True);
6831 return Py_True;
6832 }
6833 }
6834 if (left == right)
6835 result = 0;
6836 else
6837 result = unicode_compare((PyUnicodeObject *)left,
6838 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006839
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006840 /* Convert the return value to a Boolean */
6841 switch (op) {
6842 case Py_EQ:
6843 v = TEST_COND(result == 0);
6844 break;
6845 case Py_NE:
6846 v = TEST_COND(result != 0);
6847 break;
6848 case Py_LE:
6849 v = TEST_COND(result <= 0);
6850 break;
6851 case Py_GE:
6852 v = TEST_COND(result >= 0);
6853 break;
6854 case Py_LT:
6855 v = TEST_COND(result == -1);
6856 break;
6857 case Py_GT:
6858 v = TEST_COND(result == 1);
6859 break;
6860 default:
6861 PyErr_BadArgument();
6862 return NULL;
6863 }
6864 Py_INCREF(v);
6865 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006866 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006868 Py_INCREF(Py_NotImplemented);
6869 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006870}
6871
Guido van Rossum403d68b2000-03-13 15:55:09 +00006872int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006874{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006875 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006876 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006877
6878 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006879 sub = PyUnicode_FromObject(element);
6880 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 PyErr_Format(PyExc_TypeError,
6882 "'in <string>' requires string as left operand, not %s",
6883 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006884 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006885 }
6886
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887 str = PyUnicode_FromObject(container);
6888 if (!str) {
6889 Py_DECREF(sub);
6890 return -1;
6891 }
6892
6893 result = stringlib_contains_obj(str, sub);
6894
6895 Py_DECREF(str);
6896 Py_DECREF(sub);
6897
Guido van Rossum403d68b2000-03-13 15:55:09 +00006898 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006899}
6900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901/* Concat to string or Unicode object giving a new Unicode object. */
6902
6903PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 PyUnicodeObject *u = NULL, *v = NULL, *w;
6907
6908 /* Coerce the two arguments */
6909 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6910 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6913 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916 /* Shortcuts */
6917 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 Py_DECREF(v);
6919 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 }
6921 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 Py_DECREF(u);
6923 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 }
6925
6926 /* Concat the two Unicode strings */
6927 w = _PyUnicode_New(u->length + v->length);
6928 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 Py_UNICODE_COPY(w->str, u->str, u->length);
6931 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6932
6933 Py_DECREF(u);
6934 Py_DECREF(v);
6935 return (PyObject *)w;
6936
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 Py_XDECREF(u);
6939 Py_XDECREF(v);
6940 return NULL;
6941}
6942
Walter Dörwald1ab83302007-05-18 17:15:44 +00006943void
6944PyUnicode_Append(PyObject **pleft, PyObject *right)
6945{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006946 PyObject *new;
6947 if (*pleft == NULL)
6948 return;
6949 if (right == NULL || !PyUnicode_Check(*pleft)) {
6950 Py_DECREF(*pleft);
6951 *pleft = NULL;
6952 return;
6953 }
6954 new = PyUnicode_Concat(*pleft, right);
6955 Py_DECREF(*pleft);
6956 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006957}
6958
6959void
6960PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6961{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 PyUnicode_Append(pleft, right);
6963 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006964}
6965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006969Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006970string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject *
6974unicode_count(PyUnicodeObject *self, PyObject *args)
6975{
6976 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006977 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006978 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 PyObject *result;
6980
Guido van Rossumb8872e62000-05-09 14:14:27 +00006981 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 return NULL;
6984
6985 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006986 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006989
Thomas Wouters477c8d52006-05-27 19:21:47 +00006990 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
Christian Heimes217cfd12007-12-02 14:31:20 +00006992 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006993 stringlib_count(self->str + start, end - start,
6994 substring->str, substring->length)
6995 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006998
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return result;
7000}
7001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007002PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007005Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007006to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007007handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007008a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7009'xmlcharrefreplace' as well as any other name registered with\n\
7010codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
7012static PyObject *
7013unicode_encode(PyUnicodeObject *self, PyObject *args)
7014{
7015 char *encoding = NULL;
7016 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007017 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007018
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7020 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007021 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007022 if (v == NULL)
7023 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007024 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007025 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007026 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007027 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007028 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007029 Py_DECREF(v);
7030 return NULL;
7031 }
7032 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007033
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007035 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007036}
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
7041Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007042If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044static PyObject*
7045unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7046{
7047 Py_UNICODE *e;
7048 Py_UNICODE *p;
7049 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007050 Py_UNICODE *qe;
7051 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 PyUnicodeObject *u;
7053 int tabsize = 8;
7054
7055 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057
Thomas Wouters7e474022000-07-16 12:04:32 +00007058 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007059 i = 0; /* chars up to and including most recent \n or \r */
7060 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7061 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 for (p = self->str; p < e; p++)
7063 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 if (tabsize > 0) {
7065 incr = tabsize - (j % tabsize); /* cannot overflow */
7066 if (j > PY_SSIZE_T_MAX - incr)
7067 goto overflow1;
7068 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007069 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 if (j > PY_SSIZE_T_MAX - 1)
7073 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 j++;
7075 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 if (i > PY_SSIZE_T_MAX - j)
7077 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007079 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 }
7081 }
7082
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007083 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007085
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 /* Second pass: create output string and fill it */
7087 u = _PyUnicode_New(i + j);
7088 if (!u)
7089 return NULL;
7090
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007091 j = 0; /* same as in first pass */
7092 q = u->str; /* next output char */
7093 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094
7095 for (p = self->str; p < e; p++)
7096 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 if (tabsize > 0) {
7098 i = tabsize - (j % tabsize);
7099 j += i;
7100 while (i--) {
7101 if (q >= qe)
7102 goto overflow2;
7103 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007104 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 else {
7108 if (q >= qe)
7109 goto overflow2;
7110 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007111 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 if (*p == '\n' || *p == '\r')
7113 j = 0;
7114 }
7115
7116 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007117
7118 overflow2:
7119 Py_DECREF(u);
7120 overflow1:
7121 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123}
7124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127\n\
7128Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007129such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130arguments start and end are interpreted as in slice notation.\n\
7131\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134static PyObject *
7135unicode_find(PyUnicodeObject *self, PyObject *args)
7136{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007137 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007138 Py_ssize_t start;
7139 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007140 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
Christian Heimes9cd17752007-11-18 19:35:23 +00007142 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144
Thomas Wouters477c8d52006-05-27 19:21:47 +00007145 result = stringlib_find_slice(
7146 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7147 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7148 start, end
7149 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152
Christian Heimes217cfd12007-12-02 14:31:20 +00007153 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154}
7155
7156static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007157unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
7159 if (index < 0 || index >= self->length) {
7160 PyErr_SetString(PyExc_IndexError, "string index out of range");
7161 return NULL;
7162 }
7163
7164 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7165}
7166
Guido van Rossumc2504932007-09-18 19:42:40 +00007167/* Believe it or not, this produces the same value for ASCII strings
7168 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007170unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171{
Guido van Rossumc2504932007-09-18 19:42:40 +00007172 Py_ssize_t len;
7173 Py_UNICODE *p;
7174 long x;
7175
7176 if (self->hash != -1)
7177 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007178 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007179 p = self->str;
7180 x = *p << 7;
7181 while (--len >= 0)
7182 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007183 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007184 if (x == -1)
7185 x = -2;
7186 self->hash = x;
7187 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188}
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194
7195static PyObject *
7196unicode_index(PyUnicodeObject *self, PyObject *args)
7197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007199 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007200 Py_ssize_t start;
7201 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
Christian Heimes9cd17752007-11-18 19:35:23 +00007203 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205
Thomas Wouters477c8d52006-05-27 19:21:47 +00007206 result = stringlib_find_slice(
7207 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7208 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7209 start, end
7210 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (result < 0) {
7215 PyErr_SetString(PyExc_ValueError, "substring not found");
7216 return NULL;
7217 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007218
Christian Heimes217cfd12007-12-02 14:31:20 +00007219 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007222PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007225Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007226at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007229unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230{
7231 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7232 register const Py_UNICODE *e;
7233 int cased;
7234
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 /* Shortcut for single character strings */
7236 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007239 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007240 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007242
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 e = p + PyUnicode_GET_SIZE(self);
7244 cased = 0;
7245 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007247
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7249 return PyBool_FromLong(0);
7250 else if (!cased && Py_UNICODE_ISLOWER(ch))
7251 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007253 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254}
7255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007256PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007259Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007260at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
7262static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007263unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264{
7265 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7266 register const Py_UNICODE *e;
7267 int cased;
7268
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 /* Shortcut for single character strings */
7270 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007273 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007274 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007276
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 e = p + PyUnicode_GET_SIZE(self);
7278 cased = 0;
7279 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007281
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7283 return PyBool_FromLong(0);
7284 else if (!cased && Py_UNICODE_ISUPPER(ch))
7285 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007287 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007293Return True if S is a titlecased string and there is at least one\n\
7294character in S, i.e. upper- and titlecase characters may only\n\
7295follow uncased characters and lowercase characters only cased ones.\n\
7296Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007299unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
7301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7302 register const Py_UNICODE *e;
7303 int cased, previous_is_cased;
7304
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 /* Shortcut for single character strings */
7306 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7308 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007310 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007311 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007313
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 e = p + PyUnicode_GET_SIZE(self);
7315 cased = 0;
7316 previous_is_cased = 0;
7317 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007319
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7321 if (previous_is_cased)
7322 return PyBool_FromLong(0);
7323 previous_is_cased = 1;
7324 cased = 1;
7325 }
7326 else if (Py_UNICODE_ISLOWER(ch)) {
7327 if (!previous_is_cased)
7328 return PyBool_FromLong(0);
7329 previous_is_cased = 1;
7330 cased = 1;
7331 }
7332 else
7333 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007335 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336}
7337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007338PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007341Return True if all characters in S are whitespace\n\
7342and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
7344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007345unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346{
7347 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7348 register const Py_UNICODE *e;
7349
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 /* Shortcut for single character strings */
7351 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 Py_UNICODE_ISSPACE(*p))
7353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007355 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007356 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007358
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 e = p + PyUnicode_GET_SIZE(self);
7360 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 if (!Py_UNICODE_ISSPACE(*p))
7362 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007364 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365}
7366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007367PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007369\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007370Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007371and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007372
7373static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007374unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007375{
7376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7377 register const Py_UNICODE *e;
7378
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007379 /* Shortcut for single character strings */
7380 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 Py_UNICODE_ISALPHA(*p))
7382 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007383
7384 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007385 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007387
7388 e = p + PyUnicode_GET_SIZE(self);
7389 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 if (!Py_UNICODE_ISALPHA(*p))
7391 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007392 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007393 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007394}
7395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007398\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007399Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007400and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007401
7402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007403unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007404{
7405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7406 register const Py_UNICODE *e;
7407
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408 /* Shortcut for single character strings */
7409 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 Py_UNICODE_ISALNUM(*p))
7411 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007412
7413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007414 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007416
7417 e = p + PyUnicode_GET_SIZE(self);
7418 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 if (!Py_UNICODE_ISALNUM(*p))
7420 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007422 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007423}
7424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007425PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007428Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007429False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
7431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007432unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433{
7434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7435 register const Py_UNICODE *e;
7436
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 /* Shortcut for single character strings */
7438 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 Py_UNICODE_ISDECIMAL(*p))
7440 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007442 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007443 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007445
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 e = p + PyUnicode_GET_SIZE(self);
7447 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 if (!Py_UNICODE_ISDECIMAL(*p))
7449 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007451 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452}
7453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007457Return True if all characters in S are digits\n\
7458and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
7460static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007461unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462{
7463 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7464 register const Py_UNICODE *e;
7465
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 /* Shortcut for single character strings */
7467 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 Py_UNICODE_ISDIGIT(*p))
7469 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007471 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007472 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 e = p + PyUnicode_GET_SIZE(self);
7476 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 if (!Py_UNICODE_ISDIGIT(*p))
7478 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007480 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481}
7482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007483PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007486Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007487False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
7489static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007490unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491{
7492 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7493 register const Py_UNICODE *e;
7494
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 /* Shortcut for single character strings */
7496 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 Py_UNICODE_ISNUMERIC(*p))
7498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007500 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007501 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007503
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 e = p + PyUnicode_GET_SIZE(self);
7505 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 if (!Py_UNICODE_ISNUMERIC(*p))
7507 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007509 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510}
7511
Martin v. Löwis47383402007-08-15 07:32:56 +00007512int
7513PyUnicode_IsIdentifier(PyObject *self)
7514{
7515 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7516 register const Py_UNICODE *e;
7517
7518 /* Special case for empty strings */
7519 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007521
7522 /* PEP 3131 says that the first character must be in
7523 XID_Start and subsequent characters in XID_Continue,
7524 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007525 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007526 letters, digits, underscore). However, given the current
7527 definition of XID_Start and XID_Continue, it is sufficient
7528 to check just for these, except that _ must be allowed
7529 as starting an identifier. */
7530 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7531 return 0;
7532
7533 e = p + PyUnicode_GET_SIZE(self);
7534 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 if (!_PyUnicode_IsXidContinue(*p))
7536 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007537 }
7538 return 1;
7539}
7540
7541PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007543\n\
7544Return True if S is a valid identifier according\n\
7545to the language definition.");
7546
7547static PyObject*
7548unicode_isidentifier(PyObject *self)
7549{
7550 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7551}
7552
Georg Brandl559e5d72008-06-11 18:37:52 +00007553PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007555\n\
7556Return True if all characters in S are considered\n\
7557printable in repr() or S is empty, False otherwise.");
7558
7559static PyObject*
7560unicode_isprintable(PyObject *self)
7561{
7562 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7563 register const Py_UNICODE *e;
7564
7565 /* Shortcut for single character strings */
7566 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7567 Py_RETURN_TRUE;
7568 }
7569
7570 e = p + PyUnicode_GET_SIZE(self);
7571 for (; p < e; p++) {
7572 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7573 Py_RETURN_FALSE;
7574 }
7575 }
7576 Py_RETURN_TRUE;
7577}
7578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007579PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581\n\
7582Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007586unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007588 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589}
7590
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592unicode_length(PyUnicodeObject *self)
7593{
7594 return self->length;
7595}
7596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007600Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007601done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
7603static PyObject *
7604unicode_ljust(PyUnicodeObject *self, PyObject *args)
7605{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007606 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007607 Py_UNICODE fillchar = ' ';
7608
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007609 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 return NULL;
7611
Tim Peters7a29bd52001-09-12 03:03:31 +00007612 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 Py_INCREF(self);
7614 return (PyObject*) self;
7615 }
7616
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007617 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618}
7619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007623Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
7625static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007626unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 return fixup(self, fixlower);
7629}
7630
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string minus the "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
7639
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007640/* externally visible for str.strip(unicode) */
7641PyObject *
7642_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7643{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7645 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7646 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7647 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7648 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007649
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007651
Benjamin Peterson14339b62009-01-31 16:36:08 +00007652 i = 0;
7653 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7655 i++;
7656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007657 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007658
Benjamin Peterson14339b62009-01-31 16:36:08 +00007659 j = len;
7660 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 do {
7662 j--;
7663 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7664 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007666
Benjamin Peterson14339b62009-01-31 16:36:08 +00007667 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 Py_INCREF(self);
7669 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007670 }
7671 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007673}
7674
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007677do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7680 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007681
Benjamin Peterson14339b62009-01-31 16:36:08 +00007682 i = 0;
7683 if (striptype != RIGHTSTRIP) {
7684 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7685 i++;
7686 }
7687 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007688
Benjamin Peterson14339b62009-01-31 16:36:08 +00007689 j = len;
7690 if (striptype != LEFTSTRIP) {
7691 do {
7692 j--;
7693 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7694 j++;
7695 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007696
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7698 Py_INCREF(self);
7699 return (PyObject*)self;
7700 }
7701 else
7702 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703}
7704
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705
7706static PyObject *
7707do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007710
Benjamin Peterson14339b62009-01-31 16:36:08 +00007711 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7712 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714 if (sep != NULL && sep != Py_None) {
7715 if (PyUnicode_Check(sep))
7716 return _PyUnicode_XStrip(self, striptype, sep);
7717 else {
7718 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 "%s arg must be None or str",
7720 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007721 return NULL;
7722 }
7723 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007724
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007726}
7727
7728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007729PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007731\n\
7732Return a copy of the string S with leading and trailing\n\
7733whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007734If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007735
7736static PyObject *
7737unicode_strip(PyUnicodeObject *self, PyObject *args)
7738{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 if (PyTuple_GET_SIZE(args) == 0)
7740 return do_strip(self, BOTHSTRIP); /* Common case */
7741 else
7742 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007743}
7744
7745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007748\n\
7749Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007750If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751
7752static PyObject *
7753unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7754{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 if (PyTuple_GET_SIZE(args) == 0)
7756 return do_strip(self, LEFTSTRIP); /* Common case */
7757 else
7758 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007759}
7760
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007764\n\
7765Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007766If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007767
7768static PyObject *
7769unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 if (PyTuple_GET_SIZE(args) == 0)
7772 return do_strip(self, RIGHTSTRIP); /* Common case */
7773 else
7774 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775}
7776
7777
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007779unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780{
7781 PyUnicodeObject *u;
7782 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007783 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007784 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
Georg Brandl222de0f2009-04-12 12:01:50 +00007786 if (len < 1) {
7787 Py_INCREF(unicode_empty);
7788 return (PyObject *)unicode_empty;
7789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
Tim Peters7a29bd52001-09-12 03:03:31 +00007791 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 /* no repeat, return original string */
7793 Py_INCREF(str);
7794 return (PyObject*) str;
7795 }
Tim Peters8f422462000-09-09 06:13:41 +00007796
7797 /* ensure # of chars needed doesn't overflow int and # of bytes
7798 * needed doesn't overflow size_t
7799 */
7800 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007801 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007802 PyErr_SetString(PyExc_OverflowError,
7803 "repeated string is too long");
7804 return NULL;
7805 }
7806 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7807 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7808 PyErr_SetString(PyExc_OverflowError,
7809 "repeated string is too long");
7810 return NULL;
7811 }
7812 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 if (!u)
7814 return NULL;
7815
7816 p = u->str;
7817
Georg Brandl222de0f2009-04-12 12:01:50 +00007818 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007819 Py_UNICODE_FILL(p, str->str[0], len);
7820 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007821 Py_ssize_t done = str->length; /* number of characters copied this far */
7822 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007824 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007825 Py_UNICODE_COPY(p+done, p, n);
7826 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 }
7829
7830 return (PyObject*) u;
7831}
7832
7833PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 PyObject *subobj,
7835 PyObject *replobj,
7836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837{
7838 PyObject *self;
7839 PyObject *str1;
7840 PyObject *str2;
7841 PyObject *result;
7842
7843 self = PyUnicode_FromObject(obj);
7844 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 str1 = PyUnicode_FromObject(subobj);
7847 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 Py_DECREF(self);
7849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 }
7851 str2 = PyUnicode_FromObject(replobj);
7852 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 Py_DECREF(self);
7854 Py_DECREF(str1);
7855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 }
Tim Petersced69f82003-09-16 20:30:58 +00007857 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 (PyUnicodeObject *)str1,
7859 (PyUnicodeObject *)str2,
7860 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 Py_DECREF(self);
7862 Py_DECREF(str1);
7863 Py_DECREF(str2);
7864 return result;
7865}
7866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007867PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869\n\
7870Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007871old replaced by new. If the optional argument count is\n\
7872given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
7874static PyObject*
7875unicode_replace(PyUnicodeObject *self, PyObject *args)
7876{
7877 PyUnicodeObject *str1;
7878 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007879 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 PyObject *result;
7881
Martin v. Löwis18e16552006-02-15 17:27:45 +00007882 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 return NULL;
7884 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7885 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007888 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 Py_DECREF(str1);
7890 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
7893 result = replace(self, str1, str2, maxcount);
7894
7895 Py_DECREF(str1);
7896 Py_DECREF(str2);
7897 return result;
7898}
7899
7900static
7901PyObject *unicode_repr(PyObject *unicode)
7902{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007903 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007904 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007905 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7906 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7907
7908 /* XXX(nnorwitz): rather than over-allocating, it would be
7909 better to choose a different scheme. Perhaps scan the
7910 first N-chars of the string and allocate based on that size.
7911 */
7912 /* Initial allocation is based on the longest-possible unichr
7913 escape.
7914
7915 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7916 unichr, so in this case it's the longest unichr escape. In
7917 narrow (UTF-16) builds this is five chars per source unichr
7918 since there are two unichrs in the surrogate pair, so in narrow
7919 (UTF-16) builds it's not the longest unichr escape.
7920
7921 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7922 so in the narrow (UTF-16) build case it's the longest unichr
7923 escape.
7924 */
7925
Walter Dörwald1ab83302007-05-18 17:15:44 +00007926 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007928#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007930#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007932#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007934 if (repr == NULL)
7935 return NULL;
7936
Walter Dörwald1ab83302007-05-18 17:15:44 +00007937 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007938
7939 /* Add quote */
7940 *p++ = (findchar(s, size, '\'') &&
7941 !findchar(s, size, '"')) ? '"' : '\'';
7942 while (size-- > 0) {
7943 Py_UNICODE ch = *s++;
7944
7945 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007946 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007947 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007948 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007949 continue;
7950 }
7951
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007953 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007954 *p++ = '\\';
7955 *p++ = 't';
7956 }
7957 else if (ch == '\n') {
7958 *p++ = '\\';
7959 *p++ = 'n';
7960 }
7961 else if (ch == '\r') {
7962 *p++ = '\\';
7963 *p++ = 'r';
7964 }
7965
7966 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007967 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007968 *p++ = '\\';
7969 *p++ = 'x';
7970 *p++ = hexdigits[(ch >> 4) & 0x000F];
7971 *p++ = hexdigits[ch & 0x000F];
7972 }
7973
Georg Brandl559e5d72008-06-11 18:37:52 +00007974 /* Copy ASCII characters as-is */
7975 else if (ch < 0x7F) {
7976 *p++ = ch;
7977 }
7978
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007980 else {
7981 Py_UCS4 ucs = ch;
7982
7983#ifndef Py_UNICODE_WIDE
7984 Py_UNICODE ch2 = 0;
7985 /* Get code point from surrogate pair */
7986 if (size > 0) {
7987 ch2 = *s;
7988 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007992 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007993 size--;
7994 }
7995 }
7996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007997 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007998 (categories Z* and C* except ASCII space)
7999 */
8000 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8001 /* Map 8-bit characters to '\xhh' */
8002 if (ucs <= 0xff) {
8003 *p++ = '\\';
8004 *p++ = 'x';
8005 *p++ = hexdigits[(ch >> 4) & 0x000F];
8006 *p++ = hexdigits[ch & 0x000F];
8007 }
8008 /* Map 21-bit characters to '\U00xxxxxx' */
8009 else if (ucs >= 0x10000) {
8010 *p++ = '\\';
8011 *p++ = 'U';
8012 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8013 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8014 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8015 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8016 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8017 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8018 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8019 *p++ = hexdigits[ucs & 0x0000000F];
8020 }
8021 /* Map 16-bit characters to '\uxxxx' */
8022 else {
8023 *p++ = '\\';
8024 *p++ = 'u';
8025 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8026 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8027 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8028 *p++ = hexdigits[ucs & 0x000F];
8029 }
8030 }
8031 /* Copy characters as-is */
8032 else {
8033 *p++ = ch;
8034#ifndef Py_UNICODE_WIDE
8035 if (ucs >= 0x10000)
8036 *p++ = ch2;
8037#endif
8038 }
8039 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008040 }
8041 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008042 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008043
8044 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008045 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008046 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047}
8048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008049PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051\n\
8052Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008053such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054arguments start and end are interpreted as in slice notation.\n\
8055\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008056Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
8058static PyObject *
8059unicode_rfind(PyUnicodeObject *self, PyObject *args)
8060{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008061 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008062 Py_ssize_t start;
8063 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008064 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065
Christian Heimes9cd17752007-11-18 19:35:23 +00008066 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068
Thomas Wouters477c8d52006-05-27 19:21:47 +00008069 result = stringlib_rfind_slice(
8070 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8071 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8072 start, end
8073 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074
8075 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008076
Christian Heimes217cfd12007-12-02 14:31:20 +00008077 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078}
8079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008080PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008083Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
8085static PyObject *
8086unicode_rindex(PyUnicodeObject *self, PyObject *args)
8087{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008088 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008089 Py_ssize_t start;
8090 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008091 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092
Christian Heimes9cd17752007-11-18 19:35:23 +00008093 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095
Thomas Wouters477c8d52006-05-27 19:21:47 +00008096 result = stringlib_rfind_slice(
8097 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8098 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8099 start, end
8100 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101
8102 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008103
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 if (result < 0) {
8105 PyErr_SetString(PyExc_ValueError, "substring not found");
8106 return NULL;
8107 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008108 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109}
8110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008111PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008114Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008115done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116
8117static PyObject *
8118unicode_rjust(PyUnicodeObject *self, PyObject *args)
8119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008120 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008121 Py_UNICODE fillchar = ' ';
8122
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008123 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 return NULL;
8125
Tim Peters7a29bd52001-09-12 03:03:31 +00008126 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 Py_INCREF(self);
8128 return (PyObject*) self;
8129 }
8130
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008131 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132}
8133
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 PyObject *sep,
8136 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
8138 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008139
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 s = PyUnicode_FromObject(s);
8141 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 if (sep != NULL) {
8144 sep = PyUnicode_FromObject(sep);
8145 if (sep == NULL) {
8146 Py_DECREF(s);
8147 return NULL;
8148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 }
8150
8151 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8152
8153 Py_DECREF(s);
8154 Py_XDECREF(sep);
8155 return result;
8156}
8157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008158PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160\n\
8161Return a list of the words in S, using sep as the\n\
8162delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008163splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008164whitespace string is a separator and empty strings are\n\
8165removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167static PyObject*
8168unicode_split(PyUnicodeObject *self, PyObject *args)
8169{
8170 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008171 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
Martin v. Löwis18e16552006-02-15 17:27:45 +00008173 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 return NULL;
8175
8176 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182}
8183
Thomas Wouters477c8d52006-05-27 19:21:47 +00008184PyObject *
8185PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8186{
8187 PyObject* str_obj;
8188 PyObject* sep_obj;
8189 PyObject* out;
8190
8191 str_obj = PyUnicode_FromObject(str_in);
8192 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194 sep_obj = PyUnicode_FromObject(sep_in);
8195 if (!sep_obj) {
8196 Py_DECREF(str_obj);
8197 return NULL;
8198 }
8199
8200 out = stringlib_partition(
8201 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8202 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8203 );
8204
8205 Py_DECREF(sep_obj);
8206 Py_DECREF(str_obj);
8207
8208 return out;
8209}
8210
8211
8212PyObject *
8213PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8214{
8215 PyObject* str_obj;
8216 PyObject* sep_obj;
8217 PyObject* out;
8218
8219 str_obj = PyUnicode_FromObject(str_in);
8220 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008222 sep_obj = PyUnicode_FromObject(sep_in);
8223 if (!sep_obj) {
8224 Py_DECREF(str_obj);
8225 return NULL;
8226 }
8227
8228 out = stringlib_rpartition(
8229 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8230 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8231 );
8232
8233 Py_DECREF(sep_obj);
8234 Py_DECREF(str_obj);
8235
8236 return out;
8237}
8238
8239PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008242Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008244found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008245
8246static PyObject*
8247unicode_partition(PyUnicodeObject *self, PyObject *separator)
8248{
8249 return PyUnicode_Partition((PyObject *)self, separator);
8250}
8251
8252PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008254\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008255Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008256the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008257separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008258
8259static PyObject*
8260unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8261{
8262 return PyUnicode_RPartition((PyObject *)self, separator);
8263}
8264
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008265PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 PyObject *sep,
8267 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008268{
8269 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008271 s = PyUnicode_FromObject(s);
8272 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008273 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 if (sep != NULL) {
8275 sep = PyUnicode_FromObject(sep);
8276 if (sep == NULL) {
8277 Py_DECREF(s);
8278 return NULL;
8279 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008280 }
8281
8282 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8283
8284 Py_DECREF(s);
8285 Py_XDECREF(sep);
8286 return result;
8287}
8288
8289PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008291\n\
8292Return a list of the words in S, using sep as the\n\
8293delimiter string, starting at the end of the string and\n\
8294working to the front. If maxsplit is given, at most maxsplit\n\
8295splits are done. If sep is not specified, any whitespace string\n\
8296is a separator.");
8297
8298static PyObject*
8299unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8300{
8301 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008303
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008305 return NULL;
8306
8307 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008309 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008311 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008313}
8314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008315PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317\n\
8318Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008319Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008320is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
8322static PyObject*
8323unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8324{
Guido van Rossum86662912000-04-11 15:38:46 +00008325 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326
Guido van Rossum86662912000-04-11 15:38:46 +00008327 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 return NULL;
8329
Guido van Rossum86662912000-04-11 15:38:46 +00008330 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331}
8332
8333static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008334PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335{
Walter Dörwald346737f2007-05-31 10:44:43 +00008336 if (PyUnicode_CheckExact(self)) {
8337 Py_INCREF(self);
8338 return self;
8339 } else
8340 /* Subtype -- return genuine unicode string with the same value. */
8341 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8342 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343}
8344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008345PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347\n\
8348Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008349and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350
8351static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008352unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 return fixup(self, fixswapcase);
8355}
8356
Georg Brandlceee0772007-11-27 23:48:05 +00008357PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008359\n\
8360Return a translation table usable for str.translate().\n\
8361If there is only one argument, it must be a dictionary mapping Unicode\n\
8362ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008363Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008364If there are two arguments, they must be strings of equal length, and\n\
8365in the resulting dictionary, each character in x will be mapped to the\n\
8366character at the same position in y. If there is a third argument, it\n\
8367must be a string, whose characters will be mapped to None in the result.");
8368
8369static PyObject*
8370unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8371{
8372 PyObject *x, *y = NULL, *z = NULL;
8373 PyObject *new = NULL, *key, *value;
8374 Py_ssize_t i = 0;
8375 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376
Georg Brandlceee0772007-11-27 23:48:05 +00008377 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8378 return NULL;
8379 new = PyDict_New();
8380 if (!new)
8381 return NULL;
8382 if (y != NULL) {
8383 /* x must be a string too, of equal length */
8384 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8385 if (!PyUnicode_Check(x)) {
8386 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8387 "be a string if there is a second argument");
8388 goto err;
8389 }
8390 if (PyUnicode_GET_SIZE(x) != ylen) {
8391 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8392 "arguments must have equal length");
8393 goto err;
8394 }
8395 /* create entries for translating chars in x to those in y */
8396 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008397 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8398 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008399 if (!key || !value)
8400 goto err;
8401 res = PyDict_SetItem(new, key, value);
8402 Py_DECREF(key);
8403 Py_DECREF(value);
8404 if (res < 0)
8405 goto err;
8406 }
8407 /* create entries for deleting chars in z */
8408 if (z != NULL) {
8409 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008410 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008411 if (!key)
8412 goto err;
8413 res = PyDict_SetItem(new, key, Py_None);
8414 Py_DECREF(key);
8415 if (res < 0)
8416 goto err;
8417 }
8418 }
8419 } else {
8420 /* x must be a dict */
8421 if (!PyDict_Check(x)) {
8422 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8423 "to maketrans it must be a dict");
8424 goto err;
8425 }
8426 /* copy entries into the new dict, converting string keys to int keys */
8427 while (PyDict_Next(x, &i, &key, &value)) {
8428 if (PyUnicode_Check(key)) {
8429 /* convert string keys to integer keys */
8430 PyObject *newkey;
8431 if (PyUnicode_GET_SIZE(key) != 1) {
8432 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8433 "table must be of length 1");
8434 goto err;
8435 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008436 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008437 if (!newkey)
8438 goto err;
8439 res = PyDict_SetItem(new, newkey, value);
8440 Py_DECREF(newkey);
8441 if (res < 0)
8442 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008443 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008444 /* just keep integer keys */
8445 if (PyDict_SetItem(new, key, value) < 0)
8446 goto err;
8447 } else {
8448 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8449 "be strings or integers");
8450 goto err;
8451 }
8452 }
8453 }
8454 return new;
8455 err:
8456 Py_DECREF(new);
8457 return NULL;
8458}
8459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008460PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462\n\
8463Return a copy of the string S, where all characters have been mapped\n\
8464through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008465Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008466Unmapped characters are left untouched. Characters mapped to None\n\
8467are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468
8469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008470unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471{
Georg Brandlceee0772007-11-27 23:48:05 +00008472 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473}
8474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008475PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008478Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479
8480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008481unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 return fixup(self, fixupper);
8484}
8485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008486PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008489Pad a numeric string S with zeros on the left, to fill a field\n\
8490of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
8492static PyObject *
8493unicode_zfill(PyUnicodeObject *self, PyObject *args)
8494{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008495 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 PyUnicodeObject *u;
8497
Martin v. Löwis18e16552006-02-15 17:27:45 +00008498 Py_ssize_t width;
8499 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 return NULL;
8501
8502 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008503 if (PyUnicode_CheckExact(self)) {
8504 Py_INCREF(self);
8505 return (PyObject*) self;
8506 }
8507 else
8508 return PyUnicode_FromUnicode(
8509 PyUnicode_AS_UNICODE(self),
8510 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 }
8513
8514 fill = width - self->length;
8515
8516 u = pad(self, fill, 0, '0');
8517
Walter Dörwald068325e2002-04-15 13:36:47 +00008518 if (u == NULL)
8519 return NULL;
8520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (u->str[fill] == '+' || u->str[fill] == '-') {
8522 /* move sign to beginning of string */
8523 u->str[0] = u->str[fill];
8524 u->str[fill] = '0';
8525 }
8526
8527 return (PyObject*) u;
8528}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
#if 0
/* Compiled out by the surrounding #if 0: returns the file-level numfree
   counter as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008538PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008541Return True if S starts with the specified prefix, False otherwise.\n\
8542With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543With optional end, stop comparing S at that position.\n\
8544prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
8546static PyObject *
8547unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008550 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008553 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008554 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008556 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8558 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008559 if (PyTuple_Check(subobj)) {
8560 Py_ssize_t i;
8561 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8562 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008564 if (substring == NULL)
8565 return NULL;
8566 result = tailmatch(self, substring, start, end, -1);
8567 Py_DECREF(substring);
8568 if (result) {
8569 Py_RETURN_TRUE;
8570 }
8571 }
8572 /* nothing matched */
8573 Py_RETURN_FALSE;
8574 }
8575 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008578 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008580 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581}
8582
8583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008584PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008587Return True if S ends with the specified suffix, False otherwise.\n\
8588With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008589With optional end, stop comparing S at that position.\n\
8590suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
8592static PyObject *
8593unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008596 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008598 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008599 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008600 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008602 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8604 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008605 if (PyTuple_Check(subobj)) {
8606 Py_ssize_t i;
8607 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8608 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008610 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008612 result = tailmatch(self, substring, start, end, +1);
8613 Py_DECREF(substring);
8614 if (result) {
8615 Py_RETURN_TRUE;
8616 }
8617 }
8618 Py_RETURN_FALSE;
8619 }
8620 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008624 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008626 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Eric Smith8c663262007-08-25 02:26:07 +00008629#include "stringlib/string_format.h"
8630
8631PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008633\n\
8634");
8635
Eric Smith4a7d76d2008-05-30 18:10:19 +00008636static PyObject *
8637unicode__format__(PyObject* self, PyObject* args)
8638{
8639 PyObject *format_spec;
8640
8641 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8642 return NULL;
8643
8644 return _PyUnicode_FormatAdvanced(self,
8645 PyUnicode_AS_UNICODE(format_spec),
8646 PyUnicode_GET_SIZE(format_spec));
8647}
8648
Eric Smith8c663262007-08-25 02:26:07 +00008649PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008651\n\
8652");
8653
8654static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008655unicode__sizeof__(PyUnicodeObject *v)
8656{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008657 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8658 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008659}
8660
8661PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008663
8664static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008665unicode_getnewargs(PyUnicodeObject *v)
8666{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008667 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008668}
8669
8670
/* Method table of the str type; installed as tp_methods of PyUnicode_Type.
   Each entry is {name, C function, calling-convention flags, docstring}.
   The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format() is the only entry that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Underscore-prefixed helpers; registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* maketrans is a static method: it is called without an instance. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* Compiled out. */
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
8735
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008736static PyObject *
8737unicode_mod(PyObject *v, PyObject *w)
8738{
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 if (!PyUnicode_Check(v)) {
8740 Py_INCREF(Py_NotImplemented);
8741 return Py_NotImplemented;
8742 }
8743 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008744}
8745
/* Number protocol of the str type (referenced as tp_as_number in
   PyUnicode_Type).  Only the remainder slot is filled in, so that
   "str % args" performs formatting via unicode_mod(); the slots that are
   not listed are left zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
8752
/* Sequence protocol of the str type (referenced as tp_as_sequence in
   PyUnicode_Type): length, concatenation, repetition, item access and
   containment.  The slice and assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8763
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008764static PyObject*
8765unicode_subscript(PyUnicodeObject* self, PyObject* item)
8766{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008767 if (PyIndex_Check(item)) {
8768 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008769 if (i == -1 && PyErr_Occurred())
8770 return NULL;
8771 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008772 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008773 return unicode_getitem(self, i);
8774 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008775 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008776 Py_UNICODE* source_buf;
8777 Py_UNICODE* result_buf;
8778 PyObject* result;
8779
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008780 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008782 return NULL;
8783 }
8784
8785 if (slicelength <= 0) {
8786 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008787 } else if (start == 0 && step == 1 && slicelength == self->length &&
8788 PyUnicode_CheckExact(self)) {
8789 Py_INCREF(self);
8790 return (PyObject *)self;
8791 } else if (step == 1) {
8792 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008793 } else {
8794 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008795 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8796 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008797
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 if (result_buf == NULL)
8799 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008800
8801 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8802 result_buf[i] = source_buf[cur];
8803 }
Tim Petersced69f82003-09-16 20:30:58 +00008804
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008805 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008806 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008807 return result;
8808 }
8809 } else {
8810 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8811 return NULL;
8812 }
8813}
8814
/* Mapping protocol of the str type (referenced as tp_as_mapping in
   PyUnicode_Type).  Provides length and subscripting (integer indices and
   slices, see unicode_subscript()); item assignment is not supported. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8820
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822/* Helpers for PyUnicode_Format() */
8823
8824static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008825getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008827 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 (*p_argidx)++;
8830 if (arglen < 0)
8831 return args;
8832 else
8833 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 }
8835 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837 return NULL;
8838}
8839
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008840/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008842static PyObject *
8843formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008845 char *p;
8846 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008848
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 x = PyFloat_AsDouble(v);
8850 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008851 return NULL;
8852
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008855
Eric Smith0923d1d2009-04-16 20:16:10 +00008856 p = PyOS_double_to_string(x, type, prec,
8857 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008858 if (p == NULL)
8859 return NULL;
8860 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008861 PyMem_Free(p);
8862 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863}
8864
Tim Peters38fd5b62000-09-21 05:43:11 +00008865static PyObject*
8866formatlong(PyObject *val, int flags, int prec, int type)
8867{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008868 char *buf;
8869 int len;
8870 PyObject *str; /* temporary string object. */
8871 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008872
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8874 if (!str)
8875 return NULL;
8876 result = PyUnicode_FromStringAndSize(buf, len);
8877 Py_DECREF(str);
8878 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008879}
8880
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881static int
8882formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008883 size_t buflen,
8884 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008886 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008887 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 if (PyUnicode_GET_SIZE(v) == 1) {
8889 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8890 buf[1] = '\0';
8891 return 1;
8892 }
8893#ifndef Py_UNICODE_WIDE
8894 if (PyUnicode_GET_SIZE(v) == 2) {
8895 /* Decode a valid surrogate pair */
8896 int c0 = PyUnicode_AS_UNICODE(v)[0];
8897 int c1 = PyUnicode_AS_UNICODE(v)[1];
8898 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8899 0xDC00 <= c1 && c1 <= 0xDFFF) {
8900 buf[0] = c0;
8901 buf[1] = c1;
8902 buf[2] = '\0';
8903 return 2;
8904 }
8905 }
8906#endif
8907 goto onError;
8908 }
8909 else {
8910 /* Integer input truncated to a character */
8911 long x;
8912 x = PyLong_AsLong(v);
8913 if (x == -1 && PyErr_Occurred())
8914 goto onError;
8915
8916 if (x < 0 || x > 0x10ffff) {
8917 PyErr_SetString(PyExc_OverflowError,
8918 "%c arg not in range(0x110000)");
8919 return -1;
8920 }
8921
8922#ifndef Py_UNICODE_WIDE
8923 if (x > 0xffff) {
8924 x -= 0x10000;
8925 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8926 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8927 return 2;
8928 }
8929#endif
8930 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008931 buf[1] = '\0';
8932 return 1;
8933 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008934
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008936 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008938 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939}
8940
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008941/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008942 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008943*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008944#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948{
8949 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008950 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 int args_owned = 0;
8952 PyUnicodeObject *result = NULL;
8953 PyObject *dict = NULL;
8954 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008955
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 PyErr_BadInternalCall();
8958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 }
8960 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008961 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 fmt = PyUnicode_AS_UNICODE(uformat);
8964 fmtcnt = PyUnicode_GET_SIZE(uformat);
8965
8966 reslen = rescnt = fmtcnt + 100;
8967 result = _PyUnicode_New(reslen);
8968 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 res = PyUnicode_AS_UNICODE(result);
8971
8972 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 arglen = PyTuple_Size(args);
8974 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
8976 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 arglen = -1;
8978 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008980 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008981 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
8984 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 if (*fmt != '%') {
8986 if (--rescnt < 0) {
8987 rescnt = fmtcnt + 100;
8988 reslen += rescnt;
8989 if (_PyUnicode_Resize(&result, reslen) < 0)
8990 goto onError;
8991 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8992 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008993 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008995 }
8996 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 /* Got a format specifier */
8998 int flags = 0;
8999 Py_ssize_t width = -1;
9000 int prec = -1;
9001 Py_UNICODE c = '\0';
9002 Py_UNICODE fill;
9003 int isnumok;
9004 PyObject *v = NULL;
9005 PyObject *temp = NULL;
9006 Py_UNICODE *pbuf;
9007 Py_UNICODE sign;
9008 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009009 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 fmt++;
9012 if (*fmt == '(') {
9013 Py_UNICODE *keystart;
9014 Py_ssize_t keylen;
9015 PyObject *key;
9016 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009017
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 if (dict == NULL) {
9019 PyErr_SetString(PyExc_TypeError,
9020 "format requires a mapping");
9021 goto onError;
9022 }
9023 ++fmt;
9024 --fmtcnt;
9025 keystart = fmt;
9026 /* Skip over balanced parentheses */
9027 while (pcount > 0 && --fmtcnt >= 0) {
9028 if (*fmt == ')')
9029 --pcount;
9030 else if (*fmt == '(')
9031 ++pcount;
9032 fmt++;
9033 }
9034 keylen = fmt - keystart - 1;
9035 if (fmtcnt < 0 || pcount > 0) {
9036 PyErr_SetString(PyExc_ValueError,
9037 "incomplete format key");
9038 goto onError;
9039 }
9040#if 0
9041 /* keys are converted to strings using UTF-8 and
9042 then looked up since Python uses strings to hold
9043 variables names etc. in its namespaces and we
9044 wouldn't want to break common idioms. */
9045 key = PyUnicode_EncodeUTF8(keystart,
9046 keylen,
9047 NULL);
9048#else
9049 key = PyUnicode_FromUnicode(keystart, keylen);
9050#endif
9051 if (key == NULL)
9052 goto onError;
9053 if (args_owned) {
9054 Py_DECREF(args);
9055 args_owned = 0;
9056 }
9057 args = PyObject_GetItem(dict, key);
9058 Py_DECREF(key);
9059 if (args == NULL) {
9060 goto onError;
9061 }
9062 args_owned = 1;
9063 arglen = -1;
9064 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 while (--fmtcnt >= 0) {
9067 switch (c = *fmt++) {
9068 case '-': flags |= F_LJUST; continue;
9069 case '+': flags |= F_SIGN; continue;
9070 case ' ': flags |= F_BLANK; continue;
9071 case '#': flags |= F_ALT; continue;
9072 case '0': flags |= F_ZERO; continue;
9073 }
9074 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 if (c == '*') {
9077 v = getnextarg(args, arglen, &argidx);
9078 if (v == NULL)
9079 goto onError;
9080 if (!PyLong_Check(v)) {
9081 PyErr_SetString(PyExc_TypeError,
9082 "* wants int");
9083 goto onError;
9084 }
9085 width = PyLong_AsLong(v);
9086 if (width == -1 && PyErr_Occurred())
9087 goto onError;
9088 if (width < 0) {
9089 flags |= F_LJUST;
9090 width = -width;
9091 }
9092 if (--fmtcnt >= 0)
9093 c = *fmt++;
9094 }
9095 else if (c >= '0' && c <= '9') {
9096 width = c - '0';
9097 while (--fmtcnt >= 0) {
9098 c = *fmt++;
9099 if (c < '0' || c > '9')
9100 break;
9101 if ((width*10) / 10 != width) {
9102 PyErr_SetString(PyExc_ValueError,
9103 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009104 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 }
9106 width = width*10 + (c - '0');
9107 }
9108 }
9109 if (c == '.') {
9110 prec = 0;
9111 if (--fmtcnt >= 0)
9112 c = *fmt++;
9113 if (c == '*') {
9114 v = getnextarg(args, arglen, &argidx);
9115 if (v == NULL)
9116 goto onError;
9117 if (!PyLong_Check(v)) {
9118 PyErr_SetString(PyExc_TypeError,
9119 "* wants int");
9120 goto onError;
9121 }
9122 prec = PyLong_AsLong(v);
9123 if (prec == -1 && PyErr_Occurred())
9124 goto onError;
9125 if (prec < 0)
9126 prec = 0;
9127 if (--fmtcnt >= 0)
9128 c = *fmt++;
9129 }
9130 else if (c >= '0' && c <= '9') {
9131 prec = c - '0';
9132 while (--fmtcnt >= 0) {
9133 c = Py_CHARMASK(*fmt++);
9134 if (c < '0' || c > '9')
9135 break;
9136 if ((prec*10) / 10 != prec) {
9137 PyErr_SetString(PyExc_ValueError,
9138 "prec too big");
9139 goto onError;
9140 }
9141 prec = prec*10 + (c - '0');
9142 }
9143 }
9144 } /* prec */
9145 if (fmtcnt >= 0) {
9146 if (c == 'h' || c == 'l' || c == 'L') {
9147 if (--fmtcnt >= 0)
9148 c = *fmt++;
9149 }
9150 }
9151 if (fmtcnt < 0) {
9152 PyErr_SetString(PyExc_ValueError,
9153 "incomplete format");
9154 goto onError;
9155 }
9156 if (c != '%') {
9157 v = getnextarg(args, arglen, &argidx);
9158 if (v == NULL)
9159 goto onError;
9160 }
9161 sign = 0;
9162 fill = ' ';
9163 switch (c) {
9164
9165 case '%':
9166 pbuf = formatbuf;
9167 /* presume that buffer length is at least 1 */
9168 pbuf[0] = '%';
9169 len = 1;
9170 break;
9171
9172 case 's':
9173 case 'r':
9174 case 'a':
9175 if (PyUnicode_Check(v) && c == 's') {
9176 temp = v;
9177 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009178 }
9179 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 if (c == 's')
9181 temp = PyObject_Str(v);
9182 else if (c == 'r')
9183 temp = PyObject_Repr(v);
9184 else
9185 temp = PyObject_ASCII(v);
9186 if (temp == NULL)
9187 goto onError;
9188 if (PyUnicode_Check(temp))
9189 /* nothing to do */;
9190 else {
9191 Py_DECREF(temp);
9192 PyErr_SetString(PyExc_TypeError,
9193 "%s argument has non-string str()");
9194 goto onError;
9195 }
9196 }
9197 pbuf = PyUnicode_AS_UNICODE(temp);
9198 len = PyUnicode_GET_SIZE(temp);
9199 if (prec >= 0 && len > prec)
9200 len = prec;
9201 break;
9202
9203 case 'i':
9204 case 'd':
9205 case 'u':
9206 case 'o':
9207 case 'x':
9208 case 'X':
9209 if (c == 'i')
9210 c = 'd';
9211 isnumok = 0;
9212 if (PyNumber_Check(v)) {
9213 PyObject *iobj=NULL;
9214
9215 if (PyLong_Check(v)) {
9216 iobj = v;
9217 Py_INCREF(iobj);
9218 }
9219 else {
9220 iobj = PyNumber_Long(v);
9221 }
9222 if (iobj!=NULL) {
9223 if (PyLong_Check(iobj)) {
9224 isnumok = 1;
9225 temp = formatlong(iobj, flags, prec, c);
9226 Py_DECREF(iobj);
9227 if (!temp)
9228 goto onError;
9229 pbuf = PyUnicode_AS_UNICODE(temp);
9230 len = PyUnicode_GET_SIZE(temp);
9231 sign = 1;
9232 }
9233 else {
9234 Py_DECREF(iobj);
9235 }
9236 }
9237 }
9238 if (!isnumok) {
9239 PyErr_Format(PyExc_TypeError,
9240 "%%%c format: a number is required, "
9241 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9242 goto onError;
9243 }
9244 if (flags & F_ZERO)
9245 fill = '0';
9246 break;
9247
9248 case 'e':
9249 case 'E':
9250 case 'f':
9251 case 'F':
9252 case 'g':
9253 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009254 temp = formatfloat(v, flags, prec, c);
9255 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009257 pbuf = PyUnicode_AS_UNICODE(temp);
9258 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 sign = 1;
9260 if (flags & F_ZERO)
9261 fill = '0';
9262 break;
9263
9264 case 'c':
9265 pbuf = formatbuf;
9266 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9267 if (len < 0)
9268 goto onError;
9269 break;
9270
9271 default:
9272 PyErr_Format(PyExc_ValueError,
9273 "unsupported format character '%c' (0x%x) "
9274 "at index %zd",
9275 (31<=c && c<=126) ? (char)c : '?',
9276 (int)c,
9277 (Py_ssize_t)(fmt - 1 -
9278 PyUnicode_AS_UNICODE(uformat)));
9279 goto onError;
9280 }
9281 if (sign) {
9282 if (*pbuf == '-' || *pbuf == '+') {
9283 sign = *pbuf++;
9284 len--;
9285 }
9286 else if (flags & F_SIGN)
9287 sign = '+';
9288 else if (flags & F_BLANK)
9289 sign = ' ';
9290 else
9291 sign = 0;
9292 }
9293 if (width < len)
9294 width = len;
9295 if (rescnt - (sign != 0) < width) {
9296 reslen -= rescnt;
9297 rescnt = width + fmtcnt + 100;
9298 reslen += rescnt;
9299 if (reslen < 0) {
9300 Py_XDECREF(temp);
9301 PyErr_NoMemory();
9302 goto onError;
9303 }
9304 if (_PyUnicode_Resize(&result, reslen) < 0) {
9305 Py_XDECREF(temp);
9306 goto onError;
9307 }
9308 res = PyUnicode_AS_UNICODE(result)
9309 + reslen - rescnt;
9310 }
9311 if (sign) {
9312 if (fill != ' ')
9313 *res++ = sign;
9314 rescnt--;
9315 if (width > len)
9316 width--;
9317 }
9318 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9319 assert(pbuf[0] == '0');
9320 assert(pbuf[1] == c);
9321 if (fill != ' ') {
9322 *res++ = *pbuf++;
9323 *res++ = *pbuf++;
9324 }
9325 rescnt -= 2;
9326 width -= 2;
9327 if (width < 0)
9328 width = 0;
9329 len -= 2;
9330 }
9331 if (width > len && !(flags & F_LJUST)) {
9332 do {
9333 --rescnt;
9334 *res++ = fill;
9335 } while (--width > len);
9336 }
9337 if (fill == ' ') {
9338 if (sign)
9339 *res++ = sign;
9340 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9341 assert(pbuf[0] == '0');
9342 assert(pbuf[1] == c);
9343 *res++ = *pbuf++;
9344 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345 }
9346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 Py_UNICODE_COPY(res, pbuf, len);
9348 res += len;
9349 rescnt -= len;
9350 while (--width >= len) {
9351 --rescnt;
9352 *res++ = ' ';
9353 }
9354 if (dict && (argidx < arglen) && c != '%') {
9355 PyErr_SetString(PyExc_TypeError,
9356 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009357 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 goto onError;
9359 }
9360 Py_XDECREF(temp);
9361 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 } /* until end */
9363 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 PyErr_SetString(PyExc_TypeError,
9365 "not all arguments converted during string formatting");
9366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 }
9368
Thomas Woutersa96affe2006-03-12 00:29:36 +00009369 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 }
9374 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 return (PyObject *)result;
9376
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 Py_XDECREF(result);
9379 Py_DECREF(uformat);
9380 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
9383 return NULL;
9384}
9385
Jeremy Hylton938ace62002-07-17 16:30:39 +00009386static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009387unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9388
Tim Peters6d6c1a32001-08-02 04:15:00 +00009389static PyObject *
9390unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9391{
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009393 static char *kwlist[] = {"object", "encoding", "errors", 0};
9394 char *encoding = NULL;
9395 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009396
Benjamin Peterson14339b62009-01-31 16:36:08 +00009397 if (type != &PyUnicode_Type)
9398 return unicode_subtype_new(type, args, kwds);
9399 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009401 return NULL;
9402 if (x == NULL)
9403 return (PyObject *)_PyUnicode_New(0);
9404 if (encoding == NULL && errors == NULL)
9405 return PyObject_Str(x);
9406 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009408}
9409
Guido van Rossume023fe02001-08-30 03:12:59 +00009410static PyObject *
9411unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9412{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 PyUnicodeObject *tmp, *pnew;
9414 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009415
Benjamin Peterson14339b62009-01-31 16:36:08 +00009416 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9417 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9418 if (tmp == NULL)
9419 return NULL;
9420 assert(PyUnicode_Check(tmp));
9421 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9422 if (pnew == NULL) {
9423 Py_DECREF(tmp);
9424 return NULL;
9425 }
9426 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9427 if (pnew->str == NULL) {
9428 _Py_ForgetReference((PyObject *)pnew);
9429 PyObject_Del(pnew);
9430 Py_DECREF(tmp);
9431 return PyErr_NoMemory();
9432 }
9433 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9434 pnew->length = n;
9435 pnew->hash = tmp->hash;
9436 Py_DECREF(tmp);
9437 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009438}
9439
/* Docstring of the str type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009446
/* Forward declaration: the iterator constructor is needed for the tp_iter
   slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Slots are listed positionally; each line names the
   PyTypeObject field it initializes.  The type is subclassable
   (Py_TPFLAGS_BASETYPE) and flagged with Py_TPFLAGS_UNICODE_SUBCLASS. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9492
9493/* Initialize the Unicode implementation */
9494
Thomas Wouters78890102000-07-22 19:25:51 +00009495void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009497 int i;
9498
Thomas Wouters477c8d52006-05-27 19:21:47 +00009499 /* XXX - move this array to unicodectype.c ? */
9500 Py_UNICODE linebreak[] = {
9501 0x000A, /* LINE FEED */
9502 0x000D, /* CARRIAGE RETURN */
9503 0x001C, /* FILE SEPARATOR */
9504 0x001D, /* GROUP SEPARATOR */
9505 0x001E, /* RECORD SEPARATOR */
9506 0x0085, /* NEXT LINE */
9507 0x2028, /* LINE SEPARATOR */
9508 0x2029, /* PARAGRAPH SEPARATOR */
9509 };
9510
Fred Drakee4315f52000-05-09 19:53:39 +00009511 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009512 free_list = NULL;
9513 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009515 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009517
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009518 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009520 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009522
9523 /* initialize the linebreak bloom filter */
9524 bloom_linebreak = make_bloom_mask(
9525 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9526 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009527
9528 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529}
9530
9531/* Finalize the Unicode implementation */
9532
Christian Heimesa156e092008-02-16 07:38:31 +00009533int
9534PyUnicode_ClearFreeList(void)
9535{
9536 int freelist_size = numfree;
9537 PyUnicodeObject *u;
9538
9539 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 PyUnicodeObject *v = u;
9541 u = *(PyUnicodeObject **)u;
9542 if (v->str)
9543 PyObject_DEL(v->str);
9544 Py_XDECREF(v->defenc);
9545 PyObject_Del(v);
9546 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009547 }
9548 free_list = NULL;
9549 assert(numfree == 0);
9550 return freelist_size;
9551}
9552
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553void
Thomas Wouters78890102000-07-22 19:25:51 +00009554_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009556 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009558 Py_XDECREF(unicode_empty);
9559 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009560
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009561 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 if (unicode_latin1[i]) {
9563 Py_DECREF(unicode_latin1[i]);
9564 unicode_latin1[i] = NULL;
9565 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009566 }
Christian Heimesa156e092008-02-16 07:38:31 +00009567 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009569
Walter Dörwald16807132007-05-25 13:52:07 +00009570void
9571PyUnicode_InternInPlace(PyObject **p)
9572{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009573 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9574 PyObject *t;
9575 if (s == NULL || !PyUnicode_Check(s))
9576 Py_FatalError(
9577 "PyUnicode_InternInPlace: unicode strings only please!");
9578 /* If it's a subclass, we don't really know what putting
9579 it in the interned dict might do. */
9580 if (!PyUnicode_CheckExact(s))
9581 return;
9582 if (PyUnicode_CHECK_INTERNED(s))
9583 return;
9584 if (interned == NULL) {
9585 interned = PyDict_New();
9586 if (interned == NULL) {
9587 PyErr_Clear(); /* Don't leave an exception */
9588 return;
9589 }
9590 }
9591 /* It might be that the GetItem call fails even
9592 though the key is present in the dictionary,
9593 namely when this happens during a stack overflow. */
9594 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009596 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009597
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 if (t) {
9599 Py_INCREF(t);
9600 Py_DECREF(*p);
9601 *p = t;
9602 return;
9603 }
Walter Dörwald16807132007-05-25 13:52:07 +00009604
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 PyThreadState_GET()->recursion_critical = 1;
9606 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9607 PyErr_Clear();
9608 PyThreadState_GET()->recursion_critical = 0;
9609 return;
9610 }
9611 PyThreadState_GET()->recursion_critical = 0;
9612 /* The two references in interned are not counted by refcnt.
9613 The deallocator will take care of this */
9614 Py_REFCNT(s) -= 2;
9615 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009616}
9617
9618void
9619PyUnicode_InternImmortal(PyObject **p)
9620{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 PyUnicode_InternInPlace(p);
9622 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9623 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9624 Py_INCREF(*p);
9625 }
Walter Dörwald16807132007-05-25 13:52:07 +00009626}
9627
9628PyObject *
9629PyUnicode_InternFromString(const char *cp)
9630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 PyObject *s = PyUnicode_FromString(cp);
9632 if (s == NULL)
9633 return NULL;
9634 PyUnicode_InternInPlace(&s);
9635 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009636}
9637
9638void _Py_ReleaseInternedUnicodeStrings(void)
9639{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 PyObject *keys;
9641 PyUnicodeObject *s;
9642 Py_ssize_t i, n;
9643 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009644
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 if (interned == NULL || !PyDict_Check(interned))
9646 return;
9647 keys = PyDict_Keys(interned);
9648 if (keys == NULL || !PyList_Check(keys)) {
9649 PyErr_Clear();
9650 return;
9651 }
Walter Dörwald16807132007-05-25 13:52:07 +00009652
Benjamin Peterson14339b62009-01-31 16:36:08 +00009653 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9654 detector, interned unicode strings are not forcibly deallocated;
9655 rather, we give them their stolen references back, and then clear
9656 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009657
Benjamin Peterson14339b62009-01-31 16:36:08 +00009658 n = PyList_GET_SIZE(keys);
9659 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009661 for (i = 0; i < n; i++) {
9662 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9663 switch (s->state) {
9664 case SSTATE_NOT_INTERNED:
9665 /* XXX Shouldn't happen */
9666 break;
9667 case SSTATE_INTERNED_IMMORTAL:
9668 Py_REFCNT(s) += 1;
9669 immortal_size += s->length;
9670 break;
9671 case SSTATE_INTERNED_MORTAL:
9672 Py_REFCNT(s) += 2;
9673 mortal_size += s->length;
9674 break;
9675 default:
9676 Py_FatalError("Inconsistent interned string state.");
9677 }
9678 s->state = SSTATE_NOT_INTERNED;
9679 }
9680 fprintf(stderr, "total size of all interned strings: "
9681 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9682 "mortal/immortal\n", mortal_size, immortal_size);
9683 Py_DECREF(keys);
9684 PyDict_Clear(interned);
9685 Py_DECREF(interned);
9686 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009687}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009688
9689
9690/********************* Unicode Iterator **************************/
9691
9692typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 PyObject_HEAD
9694 Py_ssize_t it_index;
9695 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009696} unicodeiterobject;
9697
9698static void
9699unicodeiter_dealloc(unicodeiterobject *it)
9700{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009701 _PyObject_GC_UNTRACK(it);
9702 Py_XDECREF(it->it_seq);
9703 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009704}
9705
9706static int
9707unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009709 Py_VISIT(it->it_seq);
9710 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009711}
9712
9713static PyObject *
9714unicodeiter_next(unicodeiterobject *it)
9715{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 PyUnicodeObject *seq;
9717 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009718
Benjamin Peterson14339b62009-01-31 16:36:08 +00009719 assert(it != NULL);
9720 seq = it->it_seq;
9721 if (seq == NULL)
9722 return NULL;
9723 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009724
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9726 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009727 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009728 if (item != NULL)
9729 ++it->it_index;
9730 return item;
9731 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009732
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 Py_DECREF(seq);
9734 it->it_seq = NULL;
9735 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009736}
9737
9738static PyObject *
9739unicodeiter_len(unicodeiterobject *it)
9740{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 Py_ssize_t len = 0;
9742 if (it->it_seq)
9743 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9744 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009745}
9746
9747PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9748
9749static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009750 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009752 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009753};
9754
9755PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9757 "str_iterator", /* tp_name */
9758 sizeof(unicodeiterobject), /* tp_basicsize */
9759 0, /* tp_itemsize */
9760 /* methods */
9761 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9762 0, /* tp_print */
9763 0, /* tp_getattr */
9764 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009765 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 0, /* tp_repr */
9767 0, /* tp_as_number */
9768 0, /* tp_as_sequence */
9769 0, /* tp_as_mapping */
9770 0, /* tp_hash */
9771 0, /* tp_call */
9772 0, /* tp_str */
9773 PyObject_GenericGetAttr, /* tp_getattro */
9774 0, /* tp_setattro */
9775 0, /* tp_as_buffer */
9776 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9777 0, /* tp_doc */
9778 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9779 0, /* tp_clear */
9780 0, /* tp_richcompare */
9781 0, /* tp_weaklistoffset */
9782 PyObject_SelfIter, /* tp_iter */
9783 (iternextfunc)unicodeiter_next, /* tp_iternext */
9784 unicodeiter_methods, /* tp_methods */
9785 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009786};
9787
9788static PyObject *
9789unicode_iter(PyObject *seq)
9790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009792
Benjamin Peterson14339b62009-01-31 16:36:08 +00009793 if (!PyUnicode_Check(seq)) {
9794 PyErr_BadInternalCall();
9795 return NULL;
9796 }
9797 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9798 if (it == NULL)
9799 return NULL;
9800 it->it_index = 0;
9801 Py_INCREF(seq);
9802 it->it_seq = (PyUnicodeObject *)seq;
9803 _PyObject_GC_TRACK(it);
9804 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009805}
9806
Martin v. Löwis5b222132007-06-10 09:51:05 +00009807size_t
9808Py_UNICODE_strlen(const Py_UNICODE *u)
9809{
9810 int res = 0;
9811 while(*u++)
9812 res++;
9813 return res;
9814}
9815
9816Py_UNICODE*
9817Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9818{
9819 Py_UNICODE *u = s1;
9820 while ((*u++ = *s2++));
9821 return s1;
9822}
9823
9824Py_UNICODE*
9825Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9826{
9827 Py_UNICODE *u = s1;
9828 while ((*u++ = *s2++))
9829 if (n-- == 0)
9830 break;
9831 return s1;
9832}
9833
9834int
9835Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9836{
9837 while (*s1 && *s2 && *s1 == *s2)
9838 s1++, s2++;
9839 if (*s1 && *s2)
9840 return (*s1 < *s2) ? -1 : +1;
9841 if (*s1)
9842 return 1;
9843 if (*s2)
9844 return -1;
9845 return 0;
9846}
9847
9848Py_UNICODE*
9849Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9850{
9851 const Py_UNICODE *p;
9852 for (p = s; *p; p++)
9853 if (*p == c)
9854 return (Py_UNICODE*)p;
9855 return NULL;
9856}
9857
9858
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009859#ifdef __cplusplus
9860}
9861#endif
9862
9863
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009864/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 Local variables:
9866 c-basic-offset: 4
9867 indent-tabs-mode: nil
9868 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009869*/