blob: cc70bad825a56b03fb86d82f9e80a45006fb9f82 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127): the entry is 1 for
   the whitespace characters named below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 HORIZONTAL TABULATION
     *   0x000A LINE FEED
     *   0x000B VERTICAL TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127): the entry is 1 for
   the line break characters named below and 0 for everything else.  The
   table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The mask argument is parenthesized so that expressions such as `a | b`
   are tested as a whole, and the shifted constant is unsigned long so that
   selecting bit 31 does not shift into the sign bit of an int. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; others are first screened against bloom_linebreak and
   only then passed to Py_UNICODE_ISLINEBREAK. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* Membership test: screen chr against the bloom mask first and only fall
   back to the linear search in unicode_member() when the mask matches.
   The whole expansion is parenthesized so the macro behaves as a single
   operand (e.g. under `!` or next to other operators). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Append the NUL-terminated string `string` to the output position `s`.
   Relies on two variables of the enclosing scope: `copy` (scratch read
   pointer) and `s` (write pointer, advanced past the copied characters).
   The terminating NUL is not copied.  The argument is parenthesized so
   any expression can be passed safely. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000726 /* step 1: count the number of %S/%R/%A format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
728 * these objects once during step 3 and put the result in
Benjamin Peterson29060642009-01-31 22:14:21 +0000729 an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
731 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
732 ++callcount;
733 }
734 /* step 2: allocate memory for the results of
735 * PyObject_Str()/PyObject_Repr() calls */
736 if (callcount) {
737 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
738 if (!callresults) {
739 PyErr_NoMemory();
740 return NULL;
741 }
742 callresult = callresults;
743 }
744 /* step 3: figure out how large a buffer we need */
745 for (f = format; *f; f++) {
746 if (*f == '%') {
747 const char* p = f;
748 width = 0;
749 while (ISDIGIT((unsigned)*f))
750 width = (width*10) + *f++ - '0';
751 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
752 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
755 * they don't affect the amount of space we reserve.
756 */
757 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000758 (f[1] == 'd' || f[1] == 'u'))
759 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 switch (*f) {
762 case 'c':
763 (void)va_arg(count, int);
764 /* fall through... */
765 case '%':
766 n++;
767 break;
768 case 'd': case 'u': case 'i': case 'x':
769 (void) va_arg(count, int);
770 /* 20 bytes is enough to hold a 64-bit
771 integer. Decimal takes the most space.
772 This isn't enough for octal.
773 If a width is specified we need more
774 (which we allocate later). */
775 if (width < 20)
776 width = 20;
777 n += width;
778 if (abuffersize < width)
779 abuffersize = width;
780 break;
781 case 's':
782 {
783 /* UTF-8 */
784 unsigned char*s;
785 s = va_arg(count, unsigned char*);
786 while (*s) {
787 if (*s < 128) {
788 n++; s++;
789 } else if (*s < 0xc0) {
790 /* invalid UTF-8 */
791 n++; s++;
792 } else if (*s < 0xc0) {
793 n++;
794 s++; if(!*s)break;
795 s++;
796 } else if (*s < 0xe0) {
797 n++;
798 s++; if(!*s)break;
799 s++; if(!*s)break;
800 s++;
801 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000802#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 n++;
Benjamin Peterson29060642009-01-31 22:14:21 +0000804#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000805 n+=2;
Benjamin Peterson29060642009-01-31 22:14:21 +0000806#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000807 s++; if(!*s)break;
808 s++; if(!*s)break;
809 s++; if(!*s)break;
810 s++;
811 }
812 }
813 break;
814 }
815 case 'U':
816 {
817 PyObject *obj = va_arg(count, PyObject *);
818 assert(obj && PyUnicode_Check(obj));
819 n += PyUnicode_GET_SIZE(obj);
820 break;
821 }
822 case 'V':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 const char *str = va_arg(count, const char *);
826 assert(obj || str);
827 assert(!obj || PyUnicode_Check(obj));
828 if (obj)
829 n += PyUnicode_GET_SIZE(obj);
830 else
831 n += strlen(str);
832 break;
833 }
834 case 'S':
835 {
836 PyObject *obj = va_arg(count, PyObject *);
837 PyObject *str;
838 assert(obj);
839 str = PyObject_Str(obj);
840 if (!str)
841 goto fail;
842 n += PyUnicode_GET_SIZE(str);
843 /* Remember the str and switch to the next slot */
844 *callresult++ = str;
845 break;
846 }
847 case 'R':
848 {
849 PyObject *obj = va_arg(count, PyObject *);
850 PyObject *repr;
851 assert(obj);
852 repr = PyObject_Repr(obj);
853 if (!repr)
854 goto fail;
855 n += PyUnicode_GET_SIZE(repr);
856 /* Remember the repr and switch to the next slot */
857 *callresult++ = repr;
858 break;
859 }
860 case 'A':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 PyObject *ascii;
864 assert(obj);
865 ascii = PyObject_ASCII(obj);
866 if (!ascii)
867 goto fail;
868 n += PyUnicode_GET_SIZE(ascii);
869 /* Remember the repr and switch to the next slot */
870 *callresult++ = ascii;
871 break;
872 }
873 case 'p':
874 (void) va_arg(count, int);
875 /* maximum 64-bit pointer representation:
876 * 0xffffffffffffffff
877 * so 19 characters is enough.
878 * XXX I count 18 -- what's the extra for?
879 */
880 n += 19;
881 break;
882 default:
883 /* if we stumble upon an unknown
884 formatting code, copy the rest of
885 the format string to the output
886 string. (we cannot just skip the
887 code, since there's no way to know
888 what's in the argument list) */
889 n += strlen(p);
890 goto expand;
891 }
892 } else
893 n++;
894 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000895 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000896 if (abuffersize > 20) {
897 abuffer = PyObject_Malloc(abuffersize);
898 if (!abuffer) {
899 PyErr_NoMemory();
900 goto fail;
901 }
902 realbuffer = abuffer;
903 }
904 else
905 realbuffer = buffer;
906 /* step 4: fill the buffer */
907 /* Since we've analyzed how much space we need for the worst case,
908 we don't have to resize the string.
909 There can be no errors beyond this point. */
910 string = PyUnicode_FromUnicode(NULL, n);
911 if (!string)
912 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000913
Benjamin Peterson14339b62009-01-31 16:36:08 +0000914 s = PyUnicode_AS_UNICODE(string);
915 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916
Benjamin Peterson14339b62009-01-31 16:36:08 +0000917 for (f = format; *f; f++) {
918 if (*f == '%') {
919 const char* p = f++;
920 int longflag = 0;
921 int size_tflag = 0;
922 zeropad = (*f == '0');
923 /* parse the width.precision part */
924 width = 0;
925 while (ISDIGIT((unsigned)*f))
926 width = (width*10) + *f++ - '0';
927 precision = 0;
928 if (*f == '.') {
929 f++;
930 while (ISDIGIT((unsigned)*f))
931 precision = (precision*10) + *f++ - '0';
932 }
933 /* handle the long flag, but only for %ld and %lu.
934 others can be added when necessary. */
935 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
936 longflag = 1;
937 ++f;
938 }
939 /* handle the size_t flag. */
940 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
941 size_tflag = 1;
942 ++f;
943 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000944
Benjamin Peterson14339b62009-01-31 16:36:08 +0000945 switch (*f) {
946 case 'c':
947 *s++ = va_arg(vargs, int);
948 break;
949 case 'd':
950 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
951 if (longflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, long));
953 else if (size_tflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
955 else
956 sprintf(realbuffer, fmt, va_arg(vargs, int));
957 appendstring(realbuffer);
958 break;
959 case 'u':
960 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
961 if (longflag)
962 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
963 else if (size_tflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
965 else
966 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
967 appendstring(realbuffer);
968 break;
969 case 'i':
970 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
971 sprintf(realbuffer, fmt, va_arg(vargs, int));
972 appendstring(realbuffer);
973 break;
974 case 'x':
975 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
976 sprintf(realbuffer, fmt, va_arg(vargs, int));
977 appendstring(realbuffer);
978 break;
979 case 's':
980 {
981 /* Parameter must be UTF-8 encoded.
982 In case of encoding errors, use
983 the replacement character. */
984 PyObject *u;
985 p = va_arg(vargs, char*);
986 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Peterson29060642009-01-31 22:14:21 +0000987 "replace");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000988 if (!u)
989 goto fail;
990 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Peterson29060642009-01-31 22:14:21 +0000991 PyUnicode_GET_SIZE(u));
Benjamin Peterson14339b62009-01-31 16:36:08 +0000992 s += PyUnicode_GET_SIZE(u);
993 Py_DECREF(u);
994 break;
995 }
996 case 'U':
997 {
998 PyObject *obj = va_arg(vargs, PyObject *);
999 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1000 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1001 s += size;
1002 break;
1003 }
1004 case 'V':
1005 {
1006 PyObject *obj = va_arg(vargs, PyObject *);
1007 const char *str = va_arg(vargs, const char *);
1008 if (obj) {
1009 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1010 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1011 s += size;
1012 } else {
1013 appendstring(str);
1014 }
1015 break;
1016 }
1017 case 'S':
1018 case 'R':
1019 {
1020 Py_UNICODE *ucopy;
1021 Py_ssize_t usize;
1022 Py_ssize_t upos;
1023 /* unused, since we already have the result */
1024 (void) va_arg(vargs, PyObject *);
1025 ucopy = PyUnicode_AS_UNICODE(*callresult);
1026 usize = PyUnicode_GET_SIZE(*callresult);
1027 for (upos = 0; upos<usize;)
1028 *s++ = ucopy[upos++];
1029 /* We're done with the unicode()/repr() => forget it */
1030 Py_DECREF(*callresult);
1031 /* switch to next unicode()/repr() result */
1032 ++callresult;
1033 break;
1034 }
1035 case 'p':
1036 sprintf(buffer, "%p", va_arg(vargs, void*));
1037 /* %p is ill-defined: ensure leading 0x. */
1038 if (buffer[1] == 'X')
1039 buffer[1] = 'x';
1040 else if (buffer[1] != 'x') {
1041 memmove(buffer+2, buffer, strlen(buffer)+1);
1042 buffer[0] = '0';
1043 buffer[1] = 'x';
1044 }
1045 appendstring(buffer);
1046 break;
1047 case '%':
1048 *s++ = '%';
1049 break;
1050 default:
1051 appendstring(p);
1052 goto end;
1053 }
1054 } else
1055 *s++ = *f;
1056 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001057
Benjamin Peterson29060642009-01-31 22:14:21 +00001058 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 if (callresults)
1060 PyObject_Free(callresults);
1061 if (abuffer)
1062 PyObject_Free(abuffer);
1063 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1064 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001065 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (callresults) {
1067 PyObject **callresult2 = callresults;
1068 while (callresult2 < callresult) {
1069 Py_DECREF(*callresult2);
1070 ++callresult2;
1071 }
1072 PyObject_Free(callresults);
1073 }
1074 if (abuffer)
1075 PyObject_Free(abuffer);
1076 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001077}
1078
1079#undef appendstring
1080
1081PyObject *
1082PyUnicode_FromFormat(const char *format, ...)
1083{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001084 PyObject* ret;
1085 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001086
1087#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001088 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001089#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001091#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 ret = PyUnicode_FromFormatV(format, vargs);
1093 va_end(vargs);
1094 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001095}
1096
Martin v. Löwis18e16552006-02-15 17:27:45 +00001097Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 wchar_t *w,
1099 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100{
1101 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 PyErr_BadInternalCall();
1103 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001105
1106 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110#ifdef HAVE_USABLE_WCHAR_T
1111 memcpy(w, unicode->str, size * sizeof(wchar_t));
1112#else
1113 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001114 register Py_UNICODE *u;
1115 register Py_ssize_t i;
1116 u = PyUnicode_AS_UNICODE(unicode);
1117 for (i = size; i > 0; i--)
1118 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 }
1120#endif
1121
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001122 if (size > PyUnicode_GET_SIZE(unicode))
1123 return PyUnicode_GET_SIZE(unicode);
1124 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126}
1127
1128#endif
1129
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130PyObject *PyUnicode_FromOrdinal(int ordinal)
1131{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001132 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001133
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001134 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 PyErr_SetString(PyExc_ValueError,
1136 "chr() arg not in range(0x110000)");
1137 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001138 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001139
1140#ifndef Py_UNICODE_WIDE
1141 if (ordinal > 0xffff) {
1142 ordinal -= 0x10000;
1143 s[0] = 0xD800 | (ordinal >> 10);
1144 s[1] = 0xDC00 | (ordinal & 0x3FF);
1145 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001146 }
1147#endif
1148
Hye-Shik Chang40574832004-04-06 07:24:51 +00001149 s[0] = (Py_UNICODE)ordinal;
1150 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001151}
1152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153PyObject *PyUnicode_FromObject(register PyObject *obj)
1154{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001156 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001157 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 Py_INCREF(obj);
1159 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001160 }
1161 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 /* For a Unicode subtype that's not a Unicode object,
1163 return a true Unicode object with the same data. */
1164 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1165 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001166 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001167 PyErr_Format(PyExc_TypeError,
1168 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001169 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001170 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001171}
1172
1173PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001174 const char *encoding,
1175 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001176{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001177 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001178 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001179 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001182 PyErr_BadInternalCall();
1183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001185
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001186 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001187 PyErr_SetString(PyExc_TypeError,
1188 "decoding str is not supported");
1189 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001190 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001191
1192 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001193 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001194 s = PyBytes_AS_STRING(obj);
1195 len = PyBytes_GET_SIZE(obj);
1196 }
1197 else if (PyByteArray_Check(obj)) {
1198 s = PyByteArray_AS_STRING(obj);
1199 len = PyByteArray_GET_SIZE(obj);
1200 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001201 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 /* Overwrite the error message with something more useful in
1203 case of a TypeError. */
1204 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001205 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 "coercing to str: need string or buffer, "
1207 "%.80s found",
1208 Py_TYPE(obj)->tp_name);
1209 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001210 }
Tim Petersced69f82003-09-16 20:30:58 +00001211
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001212 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001214 Py_INCREF(unicode_empty);
1215 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
Tim Petersced69f82003-09-16 20:30:58 +00001217 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001219
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001220 return v;
1221
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224}
1225
1226PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001227 Py_ssize_t size,
1228 const char *encoding,
1229 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230{
1231 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001232 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001233 char lower[20]; /* Enough for any encoding name we recognize */
1234 char *l;
1235 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236
1237 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001238 encoding = PyUnicode_GetDefaultEncoding();
1239
1240 /* Convert encoding to lower case and replace '_' with '-' in order to
1241 catch e.g. UTF_8 */
1242 e = encoding;
1243 l = lower;
1244 while (*e && l < &lower[(sizeof lower) - 2]) {
1245 if (ISUPPER(*e)) {
1246 *l++ = TOLOWER(*e++);
1247 }
1248 else if (*e == '_') {
1249 *l++ = '-';
1250 e++;
1251 }
1252 else {
1253 *l++ = *e++;
1254 }
1255 }
1256 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001257
1258 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001259 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001261 else if ((strcmp(lower, "latin-1") == 0) ||
1262 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001263 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001264#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001265 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001266 return PyUnicode_DecodeMBCS(s, size, errors);
1267#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001268 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001269 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001270 else if (strcmp(lower, "utf-16") == 0)
1271 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1272 else if (strcmp(lower, "utf-32") == 0)
1273 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274
1275 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001276 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001277 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001278 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001279 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 if (buffer == NULL)
1281 goto onError;
1282 unicode = PyCodec_Decode(buffer, encoding, errors);
1283 if (unicode == NULL)
1284 goto onError;
1285 if (!PyUnicode_Check(unicode)) {
1286 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001287 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001288 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 Py_DECREF(unicode);
1290 goto onError;
1291 }
1292 Py_DECREF(buffer);
1293 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001294
Benjamin Peterson29060642009-01-31 22:14:21 +00001295 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 Py_XDECREF(buffer);
1297 return NULL;
1298}
1299
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001300PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1301 const char *encoding,
1302 const char *errors)
1303{
1304 PyObject *v;
1305
1306 if (!PyUnicode_Check(unicode)) {
1307 PyErr_BadArgument();
1308 goto onError;
1309 }
1310
1311 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001312 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001313
1314 /* Decode via the codec registry */
1315 v = PyCodec_Decode(unicode, encoding, errors);
1316 if (v == NULL)
1317 goto onError;
1318 return v;
1319
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001321 return NULL;
1322}
1323
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001324PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1325 const char *encoding,
1326 const char *errors)
1327{
1328 PyObject *v;
1329
1330 if (!PyUnicode_Check(unicode)) {
1331 PyErr_BadArgument();
1332 goto onError;
1333 }
1334
1335 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001337
1338 /* Decode via the codec registry */
1339 v = PyCodec_Decode(unicode, encoding, errors);
1340 if (v == NULL)
1341 goto onError;
1342 if (!PyUnicode_Check(v)) {
1343 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001344 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001345 Py_TYPE(v)->tp_name);
1346 Py_DECREF(v);
1347 goto onError;
1348 }
1349 return v;
1350
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001352 return NULL;
1353}
1354
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 Py_ssize_t size,
1357 const char *encoding,
1358 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
1360 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 unicode = PyUnicode_FromUnicode(s, size);
1363 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1366 Py_DECREF(unicode);
1367 return v;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Encode via the codec registry */
1385 v = PyCodec_Encode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001399
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 }
Fred Drakee4315f52000-05-09 19:53:39 +00001404
Tim Petersced69f82003-09-16 20:30:58 +00001405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001407
1408 /* Shortcuts for common default encodings */
1409 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 if (strcmp(encoding, "utf-8") == 0)
1411 return PyUnicode_AsUTF8String(unicode);
1412 else if (strcmp(encoding, "latin-1") == 0)
1413 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001414#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001415 else if (strcmp(encoding, "mbcs") == 0)
1416 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001417#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001418 else if (strcmp(encoding, "ascii") == 0)
1419 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001420 /* During bootstrap, we may need to find the encodings
1421 package, to load the file system encoding, and require the
1422 file system encoding in order to load the encodings
1423 package.
1424
1425 Break out of this dependency by assuming that the path to
1426 the encodings module is ASCII-only. XXX could try wcstombs
1427 instead, if the file system encoding is the locale's
1428 encoding. */
1429 else if (Py_FileSystemDefaultEncoding &&
1430 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1431 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001432 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434
1435 /* Encode via the codec registry */
1436 v = PyCodec_Encode(unicode, encoding, errors);
1437 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001438 return NULL;
1439
1440 /* The normal path */
1441 if (PyBytes_Check(v))
1442 return v;
1443
1444 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001445 if (PyByteArray_Check(v)) {
1446 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001447 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001448 PyOS_snprintf(msg, sizeof(msg),
1449 "encoder %s returned buffer instead of bytes",
1450 encoding);
1451 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001452 Py_DECREF(v);
1453 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001454 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001455
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001456 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1457 Py_DECREF(v);
1458 return b;
1459 }
1460
1461 PyErr_Format(PyExc_TypeError,
1462 "encoder did not return a bytes object (type=%.400s)",
1463 Py_TYPE(v)->tp_name);
1464 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465 return NULL;
1466}
1467
1468PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *v;
1473
1474 if (!PyUnicode_Check(unicode)) {
1475 PyErr_BadArgument();
1476 goto onError;
1477 }
1478
1479 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001481
1482 /* Encode via the codec registry */
1483 v = PyCodec_Encode(unicode, encoding, errors);
1484 if (v == NULL)
1485 goto onError;
1486 if (!PyUnicode_Check(v)) {
1487 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001488 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001489 Py_TYPE(v)->tp_name);
1490 Py_DECREF(v);
1491 goto onError;
1492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001494
Benjamin Peterson29060642009-01-31 22:14:21 +00001495 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return NULL;
1497}
1498
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001499PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001501{
1502 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001503 if (v)
1504 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001505 if (errors != NULL)
1506 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001507 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001510 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001511 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001512 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001513 return v;
1514}
1515
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001516PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001517PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001518 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001519 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1520}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521
Christian Heimes5894ba72007-11-04 11:43:14 +00001522PyObject*
1523PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1524{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001525 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1526 can be undefined. If it is case, decode using UTF-8. The following assumes
1527 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1528 bootstrapping process where the codecs aren't ready yet.
1529 */
1530 if (Py_FileSystemDefaultEncoding) {
1531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001532 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001533 return PyUnicode_DecodeMBCS(s, size, "replace");
1534 }
1535#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001536 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001537 return PyUnicode_DecodeUTF8(s, size, "replace");
1538 }
1539#endif
1540 return PyUnicode_Decode(s, size,
1541 Py_FileSystemDefaultEncoding,
1542 "replace");
1543 }
1544 else {
1545 return PyUnicode_DecodeUTF8(s, size, "replace");
1546 }
1547}
1548
Martin v. Löwis5b222132007-06-10 09:51:05 +00001549char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001550_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001551{
Christian Heimesf3863112007-11-22 07:46:41 +00001552 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 return NULL;
1556 }
Christian Heimesf3863112007-11-22 07:46:41 +00001557 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1558 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001559 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001560 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001561 *psize = PyBytes_GET_SIZE(bytes);
1562 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001563}
1564
1565char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001566_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001567{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001568 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001569}
1570
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1572{
1573 if (!PyUnicode_Check(unicode)) {
1574 PyErr_BadArgument();
1575 goto onError;
1576 }
1577 return PyUnicode_AS_UNICODE(unicode);
1578
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return NULL;
1581}
1582
Martin v. Löwis18e16552006-02-15 17:27:45 +00001583Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584{
1585 if (!PyUnicode_Check(unicode)) {
1586 PyErr_BadArgument();
1587 goto onError;
1588 }
1589 return PyUnicode_GET_SIZE(unicode);
1590
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 return -1;
1593}
1594
Thomas Wouters78890102000-07-22 19:25:51 +00001595const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001596{
1597 return unicode_default_encoding;
1598}
1599
1600int PyUnicode_SetDefaultEncoding(const char *encoding)
1601{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001602 if (strcmp(encoding, unicode_default_encoding) != 0) {
1603 PyErr_Format(PyExc_ValueError,
1604 "Can only set default encoding to %s",
1605 unicode_default_encoding);
1606 return -1;
1607 }
Fred Drakee4315f52000-05-09 19:53:39 +00001608 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001609}
1610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001611/* error handling callback helper:
1612 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001613 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 and adjust various state variables.
1615 return 0 on success, -1 on error
1616*/
1617
1618static
1619int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001620 const char *encoding, const char *reason,
1621 const char **input, const char **inend, Py_ssize_t *startinpos,
1622 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1623 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001625 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626
1627 PyObject *restuple = NULL;
1628 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001629 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001630 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001631 Py_ssize_t requiredsize;
1632 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001634 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 int res = -1;
1637
1638 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 *errorHandler = PyCodec_LookupError(errors);
1640 if (*errorHandler == NULL)
1641 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 }
1643
1644 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001645 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001646 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1647 if (*exceptionObject == NULL)
1648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 }
1650 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1652 goto onError;
1653 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1654 goto onError;
1655 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1656 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 }
1658
1659 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1660 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001661 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001663 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001664 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 }
1666 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001668
1669 /* Copy back the bytes variables, which might have been modified by the
1670 callback */
1671 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1672 if (!inputobj)
1673 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001674 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001675 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001676 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001677 *input = PyBytes_AS_STRING(inputobj);
1678 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001679 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001680 /* we can DECREF safely, as the exception has another reference,
1681 so the object won't go away. */
1682 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001685 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001686 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001687 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1688 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001689 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001690
1691 /* need more space? (at least enough for what we
1692 have+the replacement+the rest of the string (starting
1693 at the new input position), so we won't have to check space
1694 when there are no errors in the rest of the string) */
1695 repptr = PyUnicode_AS_UNICODE(repunicode);
1696 repsize = PyUnicode_GET_SIZE(repunicode);
1697 requiredsize = *outpos + repsize + insize-newpos;
1698 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 if (requiredsize<2*outsize)
1700 requiredsize = 2*outsize;
1701 if (_PyUnicode_Resize(output, requiredsize) < 0)
1702 goto onError;
1703 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 }
1705 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001706 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 Py_UNICODE_COPY(*outptr, repptr, repsize);
1708 *outptr += repsize;
1709 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001710
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001711 /* we made it! */
1712 res = 0;
1713
Benjamin Peterson29060642009-01-31 22:14:21 +00001714 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 Py_XDECREF(restuple);
1716 return res;
1717}
1718
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719/* --- UTF-7 Codec -------------------------------------------------------- */
1720
1721/* see RFC2152 for details */
1722
/* Classification of the 128 ASCII code points for the UTF-7 codec, i.e.
   whether a character is special and cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive code points. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1741
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c cannot be written directly in UTF-7.  encodeO and
   encodeWS select whether the optional classes 3 (Set O) and 2
   (whitespace) of utf7_special are treated as special too. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c belongs to the base64 alphabet. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Inverse of B64 for a character already known to satisfy B64CHAR. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters through `out` for every complete group of six
   bits held in the low `bits` bits of `ch`; `out` and `bits` are updated.
   Wrapped in do/while(0) so the macro behaves as a single statement. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits)-6));                 \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* Emit a Py_UNICODE through `out` for every complete group of 16 bits
   held in the low `bits` bits of `ch`; `out`, `bits` and `surrogate` are
   updated.  The expansion assigns to a variable named `errmsg` and jumps
   to a label named `utf7Error`, both of which must exist at the point of
   use.  Wrapped in do/while(0) so the macro behaves as a single
   statement. */
#define DECODE(out, ch, bits, surrogate)                                        \
    do {                                                                        \
        while ((bits) >= 16) {                                                  \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);   \
            (bits) -= 16;                                                       \
            if (surrogate) {                                                    \
                /* We have already generated an error for the high surrogate    \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                                \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
                /* This is a surrogate pair. Unfortunately we can't represent   \
                   it in a 16-bit character */                                  \
                (surrogate) = 1;                                                \
                errmsg = "code pairs are not supported";                        \
                goto utf7Error;                                                 \
            } else {                                                            \
                *(out)++ = outCh;                                               \
            }                                                                   \
        }                                                                       \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 Py_ssize_t size,
1787 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001788{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001789 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1790}
1791
1792PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001793 Py_ssize_t size,
1794 const char *errors,
1795 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001796{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001798 Py_ssize_t startinpos;
1799 Py_ssize_t endinpos;
1800 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001801 const char *e;
1802 PyUnicodeObject *unicode;
1803 Py_UNICODE *p;
1804 const char *errmsg = "";
1805 int inShift = 0;
1806 unsigned int bitsleft = 0;
1807 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 int surrogate = 0;
1809 PyObject *errorHandler = NULL;
1810 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001811
1812 unicode = _PyUnicode_New(size);
1813 if (!unicode)
1814 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001815 if (size == 0) {
1816 if (consumed)
1817 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001819 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001820
1821 p = unicode->str;
1822 e = s + size;
1823
1824 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001827 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828
1829 if (inShift) {
1830 if ((ch == '-') || !B64CHAR(ch)) {
1831 inShift = 0;
1832 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001833
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1835 if (bitsleft >= 6) {
1836 /* The shift sequence has a partial character in it. If
1837 bitsleft < 6 then we could just classify it as padding
1838 but that is not the case here */
1839
1840 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001841 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001842 }
1843 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001844 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 here so indicate the potential of a misencoded character. */
1846
1847 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1848 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1849 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001850 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 }
1852
1853 if (ch == '-') {
1854 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001855 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 inShift = 1;
1857 }
1858 } else if (SPECIAL(ch,0,0)) {
1859 errmsg = "unexpected special character";
Benjamin Peterson14339b62009-01-31 16:36:08 +00001860 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861 } else {
1862 *p++ = ch;
1863 }
1864 } else {
1865 charsleft = (charsleft << 6) | UB64(ch);
1866 bitsleft += 6;
1867 s++;
1868 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1869 }
1870 }
1871 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001873 s++;
1874 if (s < e && *s == '-') {
1875 s++;
1876 *p++ = '+';
1877 } else
1878 {
1879 inShift = 1;
1880 bitsleft = 0;
1881 }
1882 }
1883 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001884 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001885 errmsg = "unexpected special character";
1886 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001887 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888 }
1889 else {
1890 *p++ = ch;
1891 s++;
1892 }
1893 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00001894 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 outpos = p-PyUnicode_AS_UNICODE(unicode);
1896 endinpos = s-starts;
1897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001898 errors, &errorHandler,
1899 "utf7", errmsg,
1900 &starts, &e, &startinpos, &endinpos, &exc, &s,
1901 &unicode, &outpos, &p))
1902 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 }
1904
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001905 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001906 outpos = p-PyUnicode_AS_UNICODE(unicode);
1907 endinpos = size;
1908 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001909 errors, &errorHandler,
1910 "utf7", "unterminated shift sequence",
1911 &starts, &e, &startinpos, &endinpos, &exc, &s,
1912 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001913 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 if (s < e)
Benjamin Peterson29060642009-01-31 22:14:21 +00001915 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001917 if (consumed) {
1918 if(inShift)
1919 *consumed = startinpos;
1920 else
1921 *consumed = s-starts;
1922 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001923
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001924 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001925 goto onError;
1926
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 Py_XDECREF(errorHandler);
1928 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001929 return (PyObject *)unicode;
1930
Benjamin Peterson29060642009-01-31 22:14:21 +00001931 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 Py_XDECREF(errorHandler);
1933 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934 Py_DECREF(unicode);
1935 return NULL;
1936}
1937
1938
1939PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 Py_ssize_t size,
1941 int encodeSetO,
1942 int encodeWhiteSpace,
1943 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001944{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001945 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001946 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001947 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001948 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001949 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950 unsigned int bitsleft = 0;
1951 unsigned long charsleft = 0;
1952 char * out;
1953 char * start;
1954
1955 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001958 if (cbAllocated / 5 != size)
1959 return PyErr_NoMemory();
1960
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001961 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 if (v == NULL)
1963 return NULL;
1964
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001965 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001966 for (;i < size; ++i) {
1967 Py_UNICODE ch = s[i];
1968
1969 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001970 if (ch == '+') {
1971 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 *out++ = '-';
1973 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1974 charsleft = ch;
1975 bitsleft = 16;
1976 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001977 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001979 } else {
1980 *out++ = (char) ch;
1981 }
1982 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001983 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1984 *out++ = B64(charsleft << (6-bitsleft));
1985 charsleft = 0;
1986 bitsleft = 0;
1987 /* Characters not in the BASE64 set implicitly unshift the sequence
1988 so no '-' is required, except if the character is itself a '-' */
1989 if (B64CHAR(ch) || ch == '-') {
1990 *out++ = '-';
1991 }
1992 inShift = 0;
1993 *out++ = (char) ch;
1994 } else {
1995 bitsleft += 16;
1996 charsleft = (charsleft << 16) | ch;
1997 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1998
Mark Dickinson934896d2009-02-21 20:59:32 +00001999 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00002000 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002001 or '-' then the shift sequence will be terminated implicitly and we
2002 don't have to insert a '-'. */
2003
2004 if (bitsleft == 0) {
2005 if (i + 1 < size) {
2006 Py_UNICODE ch2 = s[i+1];
2007
2008 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00002009
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 } else if (B64CHAR(ch2) || ch2 == '-') {
2011 *out++ = '-';
2012 inShift = 0;
2013 } else {
2014 inShift = 0;
2015 }
2016
2017 }
2018 else {
2019 *out++ = '-';
2020 inShift = 0;
2021 }
2022 }
Tim Petersced69f82003-09-16 20:30:58 +00002023 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002024 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002026 if (bitsleft) {
2027 *out++= B64(charsleft << (6-bitsleft) );
2028 *out++ = '-';
2029 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002030 if (_PyBytes_Resize(&v, out - start) < 0)
2031 return NULL;
2032 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002033}
2034
2035#undef SPECIAL
2036#undef B64
2037#undef B64CHAR
2038#undef UB64
2039#undef ENCODE
2040#undef DECODE
2041
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042/* --- UTF-8 Codec -------------------------------------------------------- */
2043
/* Length in bytes of the UTF-8 sequence introduced by each possible
   first byte; 0 marks a byte that is not a legal prefix.  See RFC 2279
   for details.  Each row covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2065
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 Py_ssize_t size,
2068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Walter Dörwald69652032004-09-07 20:24:22 +00002070 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2071}
2072
Antoine Pitrouab868312009-01-10 15:40:25 +00002073/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2074#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2075
2076/* Mask to quickly check whether a C 'long' contains a
2077 non-ASCII, UTF8-encoded char. */
2078#if (SIZEOF_LONG == 8)
2079# define ASCII_CHAR_MASK 0x8080808080808080L
2080#elif (SIZEOF_LONG == 4)
2081# define ASCII_CHAR_MASK 0x80808080L
2082#else
2083# error C 'long' size should be either 4 or 8!
2084#endif
2085
Walter Dörwald69652032004-09-07 20:24:22 +00002086PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 Py_ssize_t size,
2088 const char *errors,
2089 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002093 Py_ssize_t startinpos;
2094 Py_ssize_t endinpos;
2095 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002096 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 PyUnicodeObject *unicode;
2098 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002099 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 PyObject *errorHandler = NULL;
2101 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102
2103 /* Note: size will always be longer than the resulting Unicode
2104 character count */
2105 unicode = _PyUnicode_New(size);
2106 if (!unicode)
2107 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002108 if (size == 0) {
2109 if (consumed)
2110 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113
2114 /* Unpack UTF-8 encoded data */
2115 p = unicode->str;
2116 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002117 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118
2119 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121
2122 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002123 /* Fast path for runs of ASCII characters. Given that common UTF-8
2124 input will consist of an overwhelming majority of ASCII
2125 characters, we try to optimize for this case by checking
2126 as many characters as a C 'long' can contain.
2127 First, check if we can do an aligned read, as most CPUs have
2128 a penalty for unaligned reads.
2129 */
2130 if (!((size_t) s & LONG_PTR_MASK)) {
2131 /* Help register allocation */
2132 register const char *_s = s;
2133 register Py_UNICODE *_p = p;
2134 while (_s < aligned_end) {
2135 /* Read a whole long at a time (either 4 or 8 bytes),
2136 and do a fast unrolled copy if it only contains ASCII
2137 characters. */
2138 unsigned long data = *(unsigned long *) _s;
2139 if (data & ASCII_CHAR_MASK)
2140 break;
2141 _p[0] = (unsigned char) _s[0];
2142 _p[1] = (unsigned char) _s[1];
2143 _p[2] = (unsigned char) _s[2];
2144 _p[3] = (unsigned char) _s[3];
2145#if (SIZEOF_LONG == 8)
2146 _p[4] = (unsigned char) _s[4];
2147 _p[5] = (unsigned char) _s[5];
2148 _p[6] = (unsigned char) _s[6];
2149 _p[7] = (unsigned char) _s[7];
2150#endif
2151 _s += SIZEOF_LONG;
2152 _p += SIZEOF_LONG;
2153 }
2154 s = _s;
2155 p = _p;
2156 if (s == e)
2157 break;
2158 ch = (unsigned char)*s;
2159 }
2160 }
2161
2162 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002163 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 s++;
2165 continue;
2166 }
2167
2168 n = utf8_code_length[ch];
2169
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002170 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002171 if (consumed)
2172 break;
2173 else {
2174 errmsg = "unexpected end of data";
2175 startinpos = s-starts;
2176 endinpos = size;
2177 goto utf8Error;
2178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180
2181 switch (n) {
2182
2183 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002184 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002185 startinpos = s-starts;
2186 endinpos = startinpos+1;
2187 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
2189 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002190 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002191 startinpos = s-starts;
2192 endinpos = startinpos+1;
2193 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194
2195 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002196 if ((s[1] & 0xc0) != 0x80) {
2197 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 startinpos = s-starts;
2199 endinpos = startinpos+2;
2200 goto utf8Error;
2201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002203 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002204 startinpos = s-starts;
2205 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002206 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002207 goto utf8Error;
2208 }
2209 else
2210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 break;
2212
2213 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002214 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002215 (s[2] & 0xc0) != 0x80) {
2216 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002217 startinpos = s-starts;
2218 endinpos = startinpos+3;
2219 goto utf8Error;
2220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002222 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002223 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 startinpos = s-starts;
2225 endinpos = startinpos+3;
2226 goto utf8Error;
2227 }
2228 else
2229 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002230 break;
2231
2232 case 4:
2233 if ((s[1] & 0xc0) != 0x80 ||
2234 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002235 (s[3] & 0xc0) != 0x80) {
2236 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002237 startinpos = s-starts;
2238 endinpos = startinpos+4;
2239 goto utf8Error;
2240 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002241 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002242 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002244 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002245 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002246 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002247 UTF-16 */
2248 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002249 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002250 startinpos = s-starts;
2251 endinpos = startinpos+4;
2252 goto utf8Error;
2253 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002254#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002256#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002257 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002258
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 /* translate from 10000..10FFFF to 0..FFFF */
2260 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002261
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002262 /* high surrogate = top 10 bits added to D800 */
2263 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002264
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002265 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002266 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002267#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 break;
2269
2270 default:
2271 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002272 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 startinpos = s-starts;
2274 endinpos = startinpos+n;
2275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002278 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002279
Benjamin Peterson29060642009-01-31 22:14:21 +00002280 utf8Error:
2281 outpos = p-PyUnicode_AS_UNICODE(unicode);
2282 if (unicode_decode_call_errorhandler(
2283 errors, &errorHandler,
2284 "utf8", errmsg,
2285 &starts, &e, &startinpos, &endinpos, &exc, &s,
2286 &unicode, &outpos, &p))
2287 goto onError;
2288 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 }
Walter Dörwald69652032004-09-07 20:24:22 +00002290 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002291 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002294 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 goto onError;
2296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002297 Py_XDECREF(errorHandler);
2298 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 return (PyObject *)unicode;
2300
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002302 Py_XDECREF(errorHandler);
2303 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 Py_DECREF(unicode);
2305 return NULL;
2306}
2307
Antoine Pitrouab868312009-01-10 15:40:25 +00002308#undef ASCII_CHAR_MASK
2309
2310
Tim Peters602f7402002-04-27 18:03:26 +00002311/* Allocation strategy: if the string is short, convert into a stack buffer
2312 and allocate exactly as much space needed at the end. Else allocate the
2313 maximum possible needed (4 result bytes per Unicode character), and return
2314 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002315*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002316PyObject *
2317PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002318 Py_ssize_t size,
2319 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320{
Tim Peters602f7402002-04-27 18:03:26 +00002321#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002322
Guido van Rossum98297ee2007-11-06 21:34:58 +00002323 Py_ssize_t i; /* index into s of next input byte */
2324 PyObject *result; /* result string object */
2325 char *p; /* next free byte in output buffer */
2326 Py_ssize_t nallocated; /* number of result bytes allocated */
2327 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002328 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002329 PyObject *errorHandler = NULL;
2330 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002331
Tim Peters602f7402002-04-27 18:03:26 +00002332 assert(s != NULL);
2333 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Tim Peters602f7402002-04-27 18:03:26 +00002335 if (size <= MAX_SHORT_UNICHARS) {
2336 /* Write into the stack buffer; nallocated can't overflow.
2337 * At the end, we'll allocate exactly as much heap space as it
2338 * turns out we need.
2339 */
2340 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002341 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002342 p = stackbuf;
2343 }
2344 else {
2345 /* Overallocate on the heap, and give the excess back at the end. */
2346 nallocated = size * 4;
2347 if (nallocated / 4 != size) /* overflow! */
2348 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002349 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002350 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002351 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002352 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002353 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002354
Tim Peters602f7402002-04-27 18:03:26 +00002355 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002356 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002357
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002358 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002359 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002361
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002363 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002364 *p++ = (char)(0xc0 | (ch >> 6));
2365 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002366 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002367 else {
Tim Peters602f7402002-04-27 18:03:26 +00002368 /* Encode UCS2 Unicode ordinals */
2369 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002370#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002371 /* Special case: check for high surrogate */
2372 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2373 Py_UCS4 ch2 = s[i];
2374 /* Check for low surrogate and combine the two to
2375 form a UCS4 value */
2376 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002377 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002378 i++;
2379 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002380 }
Tim Peters602f7402002-04-27 18:03:26 +00002381 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002382 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002383#endif
2384 if (ch >= 0xd800 && ch <= 0xdfff) {
2385 Py_ssize_t newpos;
2386 PyObject *rep;
2387 char *prep;
2388 int k;
2389 rep = unicode_encode_call_errorhandler
2390 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2391 s, size, &exc, i-1, i, &newpos);
2392 if (!rep)
2393 goto error;
2394 /* Implementation limitations: only support error handler that return
2395 bytes, and only support up to four replacement bytes. */
2396 if (!PyBytes_Check(rep)) {
2397 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2398 Py_DECREF(rep);
2399 goto error;
2400 }
2401 if (PyBytes_Size(rep) > 4) {
2402 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2403 Py_DECREF(rep);
2404 goto error;
2405 }
2406 prep = PyBytes_AsString(rep);
2407 for(k = PyBytes_Size(rep); k > 0; k--)
2408 *p++ = *prep++;
2409 Py_DECREF(rep);
2410 continue;
2411
2412 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002413 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002414 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2415 *p++ = (char)(0x80 | (ch & 0x3f));
2416 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002417 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002418 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002419 /* Encode UCS4 Unicode ordinals */
2420 *p++ = (char)(0xf0 | (ch >> 18));
2421 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2422 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2423 *p++ = (char)(0x80 | (ch & 0x3f));
2424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002426
Guido van Rossum98297ee2007-11-06 21:34:58 +00002427 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002428 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002429 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002430 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002431 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002432 }
2433 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002434 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002435 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002436 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002437 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002438 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002439 Py_XDECREF(errorHandler);
2440 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002441 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002442 error:
2443 Py_XDECREF(errorHandler);
2444 Py_XDECREF(exc);
2445 Py_XDECREF(result);
2446 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002447
Tim Peters602f7402002-04-27 18:03:26 +00002448#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449}
2450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2452{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 if (!PyUnicode_Check(unicode)) {
2454 PyErr_BadArgument();
2455 return NULL;
2456 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002457 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 PyUnicode_GET_SIZE(unicode),
2459 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460}
2461
Walter Dörwald41980ca2007-08-16 21:55:45 +00002462/* --- UTF-32 Codec ------------------------------------------------------- */
2463
2464PyObject *
2465PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 Py_ssize_t size,
2467 const char *errors,
2468 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002469{
2470 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2471}
2472
2473PyObject *
2474PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002475 Py_ssize_t size,
2476 const char *errors,
2477 int *byteorder,
2478 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002479{
2480 const char *starts = s;
2481 Py_ssize_t startinpos;
2482 Py_ssize_t endinpos;
2483 Py_ssize_t outpos;
2484 PyUnicodeObject *unicode;
2485 Py_UNICODE *p;
2486#ifndef Py_UNICODE_WIDE
2487 int i, pairs;
2488#else
2489 const int pairs = 0;
2490#endif
2491 const unsigned char *q, *e;
2492 int bo = 0; /* assume native ordering by default */
2493 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002494 /* Offsets from q for retrieving bytes in the right order. */
2495#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2496 int iorder[] = {0, 1, 2, 3};
2497#else
2498 int iorder[] = {3, 2, 1, 0};
2499#endif
2500 PyObject *errorHandler = NULL;
2501 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002502 /* On narrow builds we split characters outside the BMP into two
2503 codepoints => count how much extra space we need. */
2504#ifndef Py_UNICODE_WIDE
2505 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002506 if (((Py_UCS4 *)s)[i] >= 0x10000)
2507 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002508#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002509
2510 /* This might be one to much, because of a BOM */
2511 unicode = _PyUnicode_New((size+3)/4+pairs);
2512 if (!unicode)
2513 return NULL;
2514 if (size == 0)
2515 return (PyObject *)unicode;
2516
2517 /* Unpack UTF-32 encoded data */
2518 p = unicode->str;
2519 q = (unsigned char *)s;
2520 e = q + size;
2521
2522 if (byteorder)
2523 bo = *byteorder;
2524
2525 /* Check for BOM marks (U+FEFF) in the input and adjust current
2526 byte order setting accordingly. In native mode, the leading BOM
2527 mark is skipped, in all other modes, it is copied to the output
2528 stream as-is (giving a ZWNBSP character). */
2529 if (bo == 0) {
2530 if (size >= 4) {
2531 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002532 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002533#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 if (bom == 0x0000FEFF) {
2535 q += 4;
2536 bo = -1;
2537 }
2538 else if (bom == 0xFFFE0000) {
2539 q += 4;
2540 bo = 1;
2541 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002542#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 if (bom == 0x0000FEFF) {
2544 q += 4;
2545 bo = 1;
2546 }
2547 else if (bom == 0xFFFE0000) {
2548 q += 4;
2549 bo = -1;
2550 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002551#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002552 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002553 }
2554
2555 if (bo == -1) {
2556 /* force LE */
2557 iorder[0] = 0;
2558 iorder[1] = 1;
2559 iorder[2] = 2;
2560 iorder[3] = 3;
2561 }
2562 else if (bo == 1) {
2563 /* force BE */
2564 iorder[0] = 3;
2565 iorder[1] = 2;
2566 iorder[2] = 1;
2567 iorder[3] = 0;
2568 }
2569
2570 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002571 Py_UCS4 ch;
2572 /* remaining bytes at the end? (size should be divisible by 4) */
2573 if (e-q<4) {
2574 if (consumed)
2575 break;
2576 errmsg = "truncated data";
2577 startinpos = ((const char *)q)-starts;
2578 endinpos = ((const char *)e)-starts;
2579 goto utf32Error;
2580 /* The remaining input chars are ignored if the callback
2581 chooses to skip the input */
2582 }
2583 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2584 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002585
Benjamin Peterson29060642009-01-31 22:14:21 +00002586 if (ch >= 0x110000)
2587 {
2588 errmsg = "codepoint not in range(0x110000)";
2589 startinpos = ((const char *)q)-starts;
2590 endinpos = startinpos+4;
2591 goto utf32Error;
2592 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002593#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002594 if (ch >= 0x10000)
2595 {
2596 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2597 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2598 }
2599 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002600#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002601 *p++ = ch;
2602 q += 4;
2603 continue;
2604 utf32Error:
2605 outpos = p-PyUnicode_AS_UNICODE(unicode);
2606 if (unicode_decode_call_errorhandler(
2607 errors, &errorHandler,
2608 "utf32", errmsg,
2609 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2610 &unicode, &outpos, &p))
2611 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002612 }
2613
2614 if (byteorder)
2615 *byteorder = bo;
2616
2617 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002618 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002619
2620 /* Adjust length */
2621 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2622 goto onError;
2623
2624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
2626 return (PyObject *)unicode;
2627
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002629 Py_DECREF(unicode);
2630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
2632 return NULL;
2633}
2634
2635PyObject *
2636PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002637 Py_ssize_t size,
2638 const char *errors,
2639 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002641 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002642 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002643 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002644#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002645 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002646#else
2647 const int pairs = 0;
2648#endif
2649 /* Offsets from p for storing byte pairs in the right order. */
2650#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int iorder[] = {0, 1, 2, 3};
2652#else
2653 int iorder[] = {3, 2, 1, 0};
2654#endif
2655
Benjamin Peterson29060642009-01-31 22:14:21 +00002656#define STORECHAR(CH) \
2657 do { \
2658 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2659 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2660 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2661 p[iorder[0]] = (CH) & 0xff; \
2662 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002663 } while(0)
2664
2665 /* In narrow builds we can output surrogate pairs as one codepoint,
2666 so we need less space. */
2667#ifndef Py_UNICODE_WIDE
2668 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2670 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2671 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002672#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002673 nsize = (size - pairs + (byteorder == 0));
2674 bytesize = nsize * 4;
2675 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002677 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002678 if (v == NULL)
2679 return NULL;
2680
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002681 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002682 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002684 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002685 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002686
2687 if (byteorder == -1) {
2688 /* force LE */
2689 iorder[0] = 0;
2690 iorder[1] = 1;
2691 iorder[2] = 2;
2692 iorder[3] = 3;
2693 }
2694 else if (byteorder == 1) {
2695 /* force BE */
2696 iorder[0] = 3;
2697 iorder[1] = 2;
2698 iorder[2] = 1;
2699 iorder[3] = 0;
2700 }
2701
2702 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002703 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002704#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2706 Py_UCS4 ch2 = *s;
2707 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2708 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2709 s++;
2710 size--;
2711 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002713#endif
2714 STORECHAR(ch);
2715 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002716
2717 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002718 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719#undef STORECHAR
2720}
2721
2722PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2723{
2724 if (!PyUnicode_Check(unicode)) {
2725 PyErr_BadArgument();
2726 return NULL;
2727 }
2728 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 PyUnicode_GET_SIZE(unicode),
2730 NULL,
2731 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002732}
2733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734/* --- UTF-16 Codec ------------------------------------------------------- */
2735
Tim Peters772747b2001-08-09 22:21:55 +00002736PyObject *
2737PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 Py_ssize_t size,
2739 const char *errors,
2740 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741{
Walter Dörwald69652032004-09-07 20:24:22 +00002742 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2743}
2744
Antoine Pitrouab868312009-01-10 15:40:25 +00002745/* Two masks for fast checking of whether a C 'long' may contain
2746 UTF16-encoded surrogate characters. This is an efficient heuristic,
2747 assuming that non-surrogate characters with a code point >= 0x8000 are
2748 rare in most input.
2749 FAST_CHAR_MASK is used when the input is in native byte ordering,
2750 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002751*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002752#if (SIZEOF_LONG == 8)
2753# define FAST_CHAR_MASK 0x8000800080008000L
2754# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2755#elif (SIZEOF_LONG == 4)
2756# define FAST_CHAR_MASK 0x80008000L
2757# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2758#else
2759# error C 'long' size should be either 4 or 8!
2760#endif
2761
Walter Dörwald69652032004-09-07 20:24:22 +00002762PyObject *
2763PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 Py_ssize_t size,
2765 const char *errors,
2766 int *byteorder,
2767 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002770 Py_ssize_t startinpos;
2771 Py_ssize_t endinpos;
2772 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 PyUnicodeObject *unicode;
2774 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002775 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002776 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002777 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002778 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002779 /* Offsets from q for retrieving byte pairs in the right order. */
2780#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2781 int ihi = 1, ilo = 0;
2782#else
2783 int ihi = 0, ilo = 1;
2784#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 PyObject *errorHandler = NULL;
2786 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787
2788 /* Note: size will always be longer than the resulting Unicode
2789 character count */
2790 unicode = _PyUnicode_New(size);
2791 if (!unicode)
2792 return NULL;
2793 if (size == 0)
2794 return (PyObject *)unicode;
2795
2796 /* Unpack UTF-16 encoded data */
2797 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002798 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002799 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
2801 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002802 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002804 /* Check for BOM marks (U+FEFF) in the input and adjust current
2805 byte order setting accordingly. In native mode, the leading BOM
2806 mark is skipped, in all other modes, it is copied to the output
2807 stream as-is (giving a ZWNBSP character). */
2808 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002809 if (size >= 2) {
2810 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002811#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 if (bom == 0xFEFF) {
2813 q += 2;
2814 bo = -1;
2815 }
2816 else if (bom == 0xFFFE) {
2817 q += 2;
2818 bo = 1;
2819 }
Tim Petersced69f82003-09-16 20:30:58 +00002820#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002821 if (bom == 0xFEFF) {
2822 q += 2;
2823 bo = 1;
2824 }
2825 else if (bom == 0xFFFE) {
2826 q += 2;
2827 bo = -1;
2828 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002829#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832
Tim Peters772747b2001-08-09 22:21:55 +00002833 if (bo == -1) {
2834 /* force LE */
2835 ihi = 1;
2836 ilo = 0;
2837 }
2838 else if (bo == 1) {
2839 /* force BE */
2840 ihi = 0;
2841 ilo = 1;
2842 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002843#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2844 native_ordering = ilo < ihi;
2845#else
2846 native_ordering = ilo > ihi;
2847#endif
Tim Peters772747b2001-08-09 22:21:55 +00002848
Antoine Pitrouab868312009-01-10 15:40:25 +00002849 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002850 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002852 /* First check for possible aligned read of a C 'long'. Unaligned
2853 reads are more expensive, better to defer to another iteration. */
2854 if (!((size_t) q & LONG_PTR_MASK)) {
2855 /* Fast path for runs of non-surrogate chars. */
2856 register const unsigned char *_q = q;
2857 Py_UNICODE *_p = p;
2858 if (native_ordering) {
2859 /* Native ordering is simple: as long as the input cannot
2860 possibly contain a surrogate char, do an unrolled copy
2861 of several 16-bit code points to the target object.
2862 The non-surrogate check is done on several input bytes
2863 at a time (as many as a C 'long' can contain). */
2864 while (_q < aligned_end) {
2865 unsigned long data = * (unsigned long *) _q;
2866 if (data & FAST_CHAR_MASK)
2867 break;
2868 _p[0] = ((unsigned short *) _q)[0];
2869 _p[1] = ((unsigned short *) _q)[1];
2870#if (SIZEOF_LONG == 8)
2871 _p[2] = ((unsigned short *) _q)[2];
2872 _p[3] = ((unsigned short *) _q)[3];
2873#endif
2874 _q += SIZEOF_LONG;
2875 _p += SIZEOF_LONG / 2;
2876 }
2877 }
2878 else {
2879 /* Byteswapped ordering is similar, but we must decompose
2880 the copy bytewise, and take care of zero'ing out the
2881 upper bytes if the target object is in 32-bit units
2882 (that is, in UCS-4 builds). */
2883 while (_q < aligned_end) {
2884 unsigned long data = * (unsigned long *) _q;
2885 if (data & SWAPPED_FAST_CHAR_MASK)
2886 break;
2887 /* Zero upper bytes in UCS-4 builds */
2888#if (Py_UNICODE_SIZE > 2)
2889 _p[0] = 0;
2890 _p[1] = 0;
2891#if (SIZEOF_LONG == 8)
2892 _p[2] = 0;
2893 _p[3] = 0;
2894#endif
2895#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002896 /* Issue #4916; UCS-4 builds on big endian machines must
2897 fill the two last bytes of each 4-byte unit. */
2898#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2899# define OFF 2
2900#else
2901# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002902#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002903 ((unsigned char *) _p)[OFF + 1] = _q[0];
2904 ((unsigned char *) _p)[OFF + 0] = _q[1];
2905 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2906 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2907#if (SIZEOF_LONG == 8)
2908 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2909 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2910 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2911 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2912#endif
2913#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002914 _q += SIZEOF_LONG;
2915 _p += SIZEOF_LONG / 2;
2916 }
2917 }
2918 p = _p;
2919 q = _q;
2920 if (q >= e)
2921 break;
2922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002926
2927 if (ch < 0xD800 || ch > 0xDFFF) {
2928 *p++ = ch;
2929 continue;
2930 }
2931
2932 /* UTF-16 code pair: */
2933 if (q > e) {
2934 errmsg = "unexpected end of data";
2935 startinpos = (((const char *)q) - 2) - starts;
2936 endinpos = ((const char *)e) + 1 - starts;
2937 goto utf16Error;
2938 }
2939 if (0xD800 <= ch && ch <= 0xDBFF) {
2940 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2941 q += 2;
2942 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002943#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002944 *p++ = ch;
2945 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002946#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002948#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 continue;
2950 }
2951 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002952 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 startinpos = (((const char *)q)-4)-starts;
2954 endinpos = startinpos+2;
2955 goto utf16Error;
2956 }
2957
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002959 errmsg = "illegal encoding";
2960 startinpos = (((const char *)q)-2)-starts;
2961 endinpos = startinpos+2;
2962 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002963
Benjamin Peterson29060642009-01-31 22:14:21 +00002964 utf16Error:
2965 outpos = p - PyUnicode_AS_UNICODE(unicode);
2966 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002967 errors,
2968 &errorHandler,
2969 "utf16", errmsg,
2970 &starts,
2971 (const char **)&e,
2972 &startinpos,
2973 &endinpos,
2974 &exc,
2975 (const char **)&q,
2976 &unicode,
2977 &outpos,
2978 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002981 /* remaining byte at the end? (size should be even) */
2982 if (e == q) {
2983 if (!consumed) {
2984 errmsg = "truncated data";
2985 startinpos = ((const char *)q) - starts;
2986 endinpos = ((const char *)e) + 1 - starts;
2987 outpos = p - PyUnicode_AS_UNICODE(unicode);
2988 if (unicode_decode_call_errorhandler(
2989 errors,
2990 &errorHandler,
2991 "utf16", errmsg,
2992 &starts,
2993 (const char **)&e,
2994 &startinpos,
2995 &endinpos,
2996 &exc,
2997 (const char **)&q,
2998 &unicode,
2999 &outpos,
3000 &p))
3001 goto onError;
3002 /* The remaining input chars are ignored if the callback
3003 chooses to skip the input */
3004 }
3005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006
3007 if (byteorder)
3008 *byteorder = bo;
3009
Walter Dörwald69652032004-09-07 20:24:22 +00003010 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003012
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003014 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 goto onError;
3016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 Py_XDECREF(errorHandler);
3018 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 return (PyObject *)unicode;
3020
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 Py_XDECREF(errorHandler);
3024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 return NULL;
3026}
3027
Antoine Pitrouab868312009-01-10 15:40:25 +00003028#undef FAST_CHAR_MASK
3029#undef SWAPPED_FAST_CHAR_MASK
3030
Tim Peters772747b2001-08-09 22:21:55 +00003031PyObject *
3032PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 Py_ssize_t size,
3034 const char *errors,
3035 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003037 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003038 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003039 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003040#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003041 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003042#else
3043 const int pairs = 0;
3044#endif
Tim Peters772747b2001-08-09 22:21:55 +00003045 /* Offsets from p for storing byte pairs in the right order. */
3046#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3047 int ihi = 1, ilo = 0;
3048#else
3049 int ihi = 0, ilo = 1;
3050#endif
3051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052#define STORECHAR(CH) \
3053 do { \
3054 p[ihi] = ((CH) >> 8) & 0xff; \
3055 p[ilo] = (CH) & 0xff; \
3056 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003057 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003059#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003060 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 if (s[i] >= 0x10000)
3062 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003063#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003064 /* 2 * (size + pairs + (byteorder == 0)) */
3065 if (size > PY_SSIZE_T_MAX ||
3066 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003068 nsize = size + pairs + (byteorder == 0);
3069 bytesize = nsize * 2;
3070 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003072 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (v == NULL)
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003076 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003079 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003080 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003081
3082 if (byteorder == -1) {
3083 /* force LE */
3084 ihi = 1;
3085 ilo = 0;
3086 }
3087 else if (byteorder == 1) {
3088 /* force BE */
3089 ihi = 0;
3090 ilo = 1;
3091 }
3092
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003093 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 Py_UNICODE ch = *s++;
3095 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003096#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 if (ch >= 0x10000) {
3098 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3099 ch = 0xD800 | ((ch-0x10000) >> 10);
3100 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003101#endif
Tim Peters772747b2001-08-09 22:21:55 +00003102 STORECHAR(ch);
3103 if (ch2)
3104 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003105 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003106
3107 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003108 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003109#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110}
3111
3112PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3113{
3114 if (!PyUnicode_Check(unicode)) {
3115 PyErr_BadArgument();
3116 return NULL;
3117 }
3118 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003119 PyUnicode_GET_SIZE(unicode),
3120 NULL,
3121 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122}
3123
3124/* --- Unicode Escape Codec ----------------------------------------------- */
3125
/* Name-lookup C API of the unicodedata module.  Starts out NULL and is
   filled in by the \N{...} branch of PyUnicode_DecodeUnicodeEscape the
   first time that branch runs. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003126static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003127
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 Py_ssize_t size,
3130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t startinpos;
3134 Py_ssize_t endinpos;
3135 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003140 char* message;
3141 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 PyObject *errorHandler = NULL;
3143 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003144
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 /* Escaped strings will always be longer than the resulting
3146 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 length after conversion to the true value.
3148 (but if the error callback returns a long replacement string
3149 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 v = _PyUnicode_New(size);
3151 if (v == NULL)
3152 goto onError;
3153 if (size == 0)
3154 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003158
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 while (s < end) {
3160 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003161 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163
3164 /* Non-escape characters are interpreted as Unicode ordinals */
3165 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003166 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 continue;
3168 }
3169
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 /* \ - Escapes */
3172 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003173 c = *s++;
3174 if (s > end)
3175 c = '\0'; /* Invalid after \ */
3176 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 case '\n': break;
3180 case '\\': *p++ = '\\'; break;
3181 case '\'': *p++ = '\''; break;
3182 case '\"': *p++ = '\"'; break;
3183 case 'b': *p++ = '\b'; break;
3184 case 'f': *p++ = '\014'; break; /* FF */
3185 case 't': *p++ = '\t'; break;
3186 case 'n': *p++ = '\n'; break;
3187 case 'r': *p++ = '\r'; break;
3188 case 'v': *p++ = '\013'; break; /* VT */
3189 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 case '0': case '1': case '2': case '3':
3193 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003194 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003195 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003196 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003197 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003198 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003200 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 break;
3202
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 /* hex escapes */
3204 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003206 digits = 2;
3207 message = "truncated \\xXX escape";
3208 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003212 digits = 4;
3213 message = "truncated \\uXXXX escape";
3214 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003217 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003218 digits = 8;
3219 message = "truncated \\UXXXXXXXX escape";
3220 hexescape:
3221 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 outpos = p-PyUnicode_AS_UNICODE(v);
3223 if (s+digits>end) {
3224 endinpos = size;
3225 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 errors, &errorHandler,
3227 "unicodeescape", "end of string in escape sequence",
3228 &starts, &end, &startinpos, &endinpos, &exc, &s,
3229 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 goto onError;
3231 goto nextByte;
3232 }
3233 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003234 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003235 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 endinpos = (s+i+1)-starts;
3237 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 errors, &errorHandler,
3239 "unicodeescape", message,
3240 &starts, &end, &startinpos, &endinpos, &exc, &s,
3241 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003242 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003244 }
3245 chr = (chr<<4) & ~0xF;
3246 if (c >= '0' && c <= '9')
3247 chr += c - '0';
3248 else if (c >= 'a' && c <= 'f')
3249 chr += 10 + c - 'a';
3250 else
3251 chr += 10 + c - 'A';
3252 }
3253 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003254 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 /* _decoding_error will have already written into the
3256 target buffer. */
3257 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003258 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003259 /* when we get here, chr is a 32-bit unicode character */
3260 if (chr <= 0xffff)
3261 /* UCS-2 character */
3262 *p++ = (Py_UNICODE) chr;
3263 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003264 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003265 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003266#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003267 *p++ = chr;
3268#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003269 chr -= 0x10000L;
3270 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003271 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003272#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003273 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 endinpos = s-starts;
3275 outpos = p-PyUnicode_AS_UNICODE(v);
3276 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 errors, &errorHandler,
3278 "unicodeescape", "illegal Unicode character",
3279 &starts, &end, &startinpos, &endinpos, &exc, &s,
3280 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003281 goto onError;
3282 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003283 break;
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003286 case 'N':
3287 message = "malformed \\N character escape";
3288 if (ucnhash_CAPI == NULL) {
3289 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003290 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003291 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003292 if (m == NULL)
3293 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003294 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003295 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003296 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003297 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003298 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003299 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003300 if (ucnhash_CAPI == NULL)
3301 goto ucnhashError;
3302 }
3303 if (*s == '{') {
3304 const char *start = s+1;
3305 /* look for the closing brace */
3306 while (*s != '}' && s < end)
3307 s++;
3308 if (s > start && s < end && *s == '}') {
3309 /* found a name. look it up in the unicode database */
3310 message = "unknown Unicode character name";
3311 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003312 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003313 goto store;
3314 }
3315 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 endinpos = s-starts;
3317 outpos = p-PyUnicode_AS_UNICODE(v);
3318 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 errors, &errorHandler,
3320 "unicodeescape", message,
3321 &starts, &end, &startinpos, &endinpos, &exc, &s,
3322 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003323 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003324 break;
3325
3326 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003327 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 message = "\\ at end of string";
3329 s--;
3330 endinpos = s-starts;
3331 outpos = p-PyUnicode_AS_UNICODE(v);
3332 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 errors, &errorHandler,
3334 "unicodeescape", message,
3335 &starts, &end, &startinpos, &endinpos, &exc, &s,
3336 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003337 goto onError;
3338 }
3339 else {
3340 *p++ = '\\';
3341 *p++ = (unsigned char)s[-1];
3342 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003343 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003348 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003350 Py_XDECREF(errorHandler);
3351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003353
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003355 PyErr_SetString(
3356 PyExc_UnicodeError,
3357 "\\N escapes not supported (can't load unicodedata module)"
3358 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003359 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 Py_XDECREF(errorHandler);
3361 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003362 return NULL;
3363
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_XDECREF(errorHandler);
3367 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 return NULL;
3369}
3370
/* Return a Unicode-Escape string version of the Unicode object
   (see PyUnicode_EncodeUnicodeEscape below).

   The encoder takes no "quotes" argument: the result is never enclosed
   in u"" or u'' quotes.

*/
3377
Thomas Wouters477c8d52006-05-27 19:21:47 +00003378Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003379 Py_ssize_t size,
3380 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003381{
3382 /* like wcschr, but doesn't stop at NULL characters */
3383
3384 while (size-- > 0) {
3385 if (*s == ch)
3386 return s;
3387 s++;
3388 }
3389
3390 return NULL;
3391}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003392
/* Lower-case hexadecimal digits, indexed by nibble value (0..15); used
   by the escape encoders to emit \x, \u and \U sequences. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00003393static const char *hexdigits = "0123456789abcdef";
3394
3395PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003398 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003401#ifdef Py_UNICODE_WIDE
3402 const Py_ssize_t expandsize = 10;
3403#else
3404 const Py_ssize_t expandsize = 6;
3405#endif
3406
Thomas Wouters89f507f2006-12-13 04:49:30 +00003407 /* XXX(nnorwitz): rather than over-allocating, it would be
3408 better to choose a different scheme. Perhaps scan the
3409 first N-chars of the string and allocate based on that size.
3410 */
3411 /* Initial allocation is based on the longest-possible unichr
3412 escape.
3413
3414 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3415 unichr, so in this case it's the longest unichr escape. In
3416 narrow (UTF-16) builds this is five chars per source unichr
3417 since there are two unichrs in the surrogate pair, so in narrow
3418 (UTF-16) builds it's not the longest unichr escape.
3419
3420 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3421 so in the narrow (UTF-16) build case it's the longest unichr
3422 escape.
3423 */
3424
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003425 if (size == 0)
3426 return PyBytes_FromStringAndSize(NULL, 0);
3427
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003428 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003430
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003431 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 2
3433 + expandsize*size
3434 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 if (repr == NULL)
3436 return NULL;
3437
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003438 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 while (size-- > 0) {
3441 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003442
Walter Dörwald79e913e2007-05-12 11:08:06 +00003443 /* Escape backslashes */
3444 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 *p++ = '\\';
3446 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003447 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003448 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003449
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003450#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003451 /* Map 21-bit characters to '\U00xxxxxx' */
3452 else if (ch >= 0x10000) {
3453 *p++ = '\\';
3454 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003455 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3456 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3457 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3458 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3459 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3460 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3461 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3462 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003463 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003464 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003465#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3467 else if (ch >= 0xD800 && ch < 0xDC00) {
3468 Py_UNICODE ch2;
3469 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003470
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 ch2 = *s++;
3472 size--;
3473 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3474 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3475 *p++ = '\\';
3476 *p++ = 'U';
3477 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3478 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3479 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3480 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3481 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3482 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3483 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3484 *p++ = hexdigits[ucs & 0x0000000F];
3485 continue;
3486 }
3487 /* Fall through: isolated surrogates are copied as-is */
3488 s--;
3489 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003490 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003491#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003492
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003494 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 *p++ = '\\';
3496 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003497 *p++ = hexdigits[(ch >> 12) & 0x000F];
3498 *p++ = hexdigits[(ch >> 8) & 0x000F];
3499 *p++ = hexdigits[(ch >> 4) & 0x000F];
3500 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003502
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003503 /* Map special whitespace to '\t', \n', '\r' */
3504 else if (ch == '\t') {
3505 *p++ = '\\';
3506 *p++ = 't';
3507 }
3508 else if (ch == '\n') {
3509 *p++ = '\\';
3510 *p++ = 'n';
3511 }
3512 else if (ch == '\r') {
3513 *p++ = '\\';
3514 *p++ = 'r';
3515 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003516
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003517 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003518 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003520 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003521 *p++ = hexdigits[(ch >> 4) & 0x000F];
3522 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003523 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003524
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 /* Copy everything else as-is */
3526 else
3527 *p++ = (char) ch;
3528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003530 assert(p - PyBytes_AS_STRING(repr) > 0);
3531 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3532 return NULL;
3533 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534}
3535
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003536PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003538 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 if (!PyUnicode_Check(unicode)) {
3540 PyErr_BadArgument();
3541 return NULL;
3542 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003543 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3544 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003545 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546}
3547
3548/* --- Raw Unicode Escape Codec ------------------------------------------- */
3549
3550PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 Py_ssize_t size,
3552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 Py_ssize_t startinpos;
3556 Py_ssize_t endinpos;
3557 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 const char *end;
3561 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 PyObject *errorHandler = NULL;
3563 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003564
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 /* Escaped strings will always be longer than the resulting
3566 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 length after conversion to the true value. (But decoding error
3568 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 v = _PyUnicode_New(size);
3570 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 end = s + size;
3576 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 unsigned char c;
3578 Py_UCS4 x;
3579 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003580 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581
Benjamin Peterson29060642009-01-31 22:14:21 +00003582 /* Non-escape characters are interpreted as Unicode ordinals */
3583 if (*s != '\\') {
3584 *p++ = (unsigned char)*s++;
3585 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 startinpos = s-starts;
3588
3589 /* \u-escapes are only interpreted iff the number of leading
3590 backslashes if odd */
3591 bs = s;
3592 for (;s < end;) {
3593 if (*s != '\\')
3594 break;
3595 *p++ = (unsigned char)*s++;
3596 }
3597 if (((s - bs) & 1) == 0 ||
3598 s >= end ||
3599 (*s != 'u' && *s != 'U')) {
3600 continue;
3601 }
3602 p--;
3603 count = *s=='u' ? 4 : 8;
3604 s++;
3605
3606 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3607 outpos = p-PyUnicode_AS_UNICODE(v);
3608 for (x = 0, i = 0; i < count; ++i, ++s) {
3609 c = (unsigned char)*s;
3610 if (!ISXDIGIT(c)) {
3611 endinpos = s-starts;
3612 if (unicode_decode_call_errorhandler(
3613 errors, &errorHandler,
3614 "rawunicodeescape", "truncated \\uXXXX",
3615 &starts, &end, &startinpos, &endinpos, &exc, &s,
3616 &v, &outpos, &p))
3617 goto onError;
3618 goto nextByte;
3619 }
3620 x = (x<<4) & ~0xF;
3621 if (c >= '0' && c <= '9')
3622 x += c - '0';
3623 else if (c >= 'a' && c <= 'f')
3624 x += 10 + c - 'a';
3625 else
3626 x += 10 + c - 'A';
3627 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003628 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 /* UCS-2 character */
3630 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003631 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 /* UCS-4 character. Either store directly, or as
3633 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003634#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003636#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 x -= 0x10000L;
3638 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3639 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003640#endif
3641 } else {
3642 endinpos = s-starts;
3643 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003644 if (unicode_decode_call_errorhandler(
3645 errors, &errorHandler,
3646 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 &starts, &end, &startinpos, &endinpos, &exc, &s,
3648 &v, &outpos, &p))
3649 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 nextByte:
3652 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003654 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 Py_XDECREF(errorHandler);
3657 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 Py_XDECREF(errorHandler);
3663 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 return NULL;
3665}
3666
3667PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003670 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 char *p;
3672 char *q;
3673
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003674#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003675 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003676#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003677 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003678#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003679
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003680 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003683 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 if (repr == NULL)
3685 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003686 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003687 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003689 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 while (size-- > 0) {
3691 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003692#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 /* Map 32-bit characters to '\Uxxxxxxxx' */
3694 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003695 *p++ = '\\';
3696 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003697 *p++ = hexdigits[(ch >> 28) & 0xf];
3698 *p++ = hexdigits[(ch >> 24) & 0xf];
3699 *p++ = hexdigits[(ch >> 20) & 0xf];
3700 *p++ = hexdigits[(ch >> 16) & 0xf];
3701 *p++ = hexdigits[(ch >> 12) & 0xf];
3702 *p++ = hexdigits[(ch >> 8) & 0xf];
3703 *p++ = hexdigits[(ch >> 4) & 0xf];
3704 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003705 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003706 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003707#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3709 if (ch >= 0xD800 && ch < 0xDC00) {
3710 Py_UNICODE ch2;
3711 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003712
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 ch2 = *s++;
3714 size--;
3715 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3716 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3717 *p++ = '\\';
3718 *p++ = 'U';
3719 *p++ = hexdigits[(ucs >> 28) & 0xf];
3720 *p++ = hexdigits[(ucs >> 24) & 0xf];
3721 *p++ = hexdigits[(ucs >> 20) & 0xf];
3722 *p++ = hexdigits[(ucs >> 16) & 0xf];
3723 *p++ = hexdigits[(ucs >> 12) & 0xf];
3724 *p++ = hexdigits[(ucs >> 8) & 0xf];
3725 *p++ = hexdigits[(ucs >> 4) & 0xf];
3726 *p++ = hexdigits[ucs & 0xf];
3727 continue;
3728 }
3729 /* Fall through: isolated surrogates are copied as-is */
3730 s--;
3731 size++;
3732 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003733#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 /* Map 16-bit characters to '\uxxxx' */
3735 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 *p++ = '\\';
3737 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003738 *p++ = hexdigits[(ch >> 12) & 0xf];
3739 *p++ = hexdigits[(ch >> 8) & 0xf];
3740 *p++ = hexdigits[(ch >> 4) & 0xf];
3741 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 /* Copy everything else as-is */
3744 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 *p++ = (char) ch;
3746 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003747 size = p - q;
3748
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003749 assert(size > 0);
3750 if (_PyBytes_Resize(&repr, size) < 0)
3751 return NULL;
3752 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
3755PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3756{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003757 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003759 PyErr_BadArgument();
3760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003762 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3763 PyUnicode_GET_SIZE(unicode));
3764
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003765 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766}
3767
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003768/* --- Unicode Internal Codec ------------------------------------------- */
3769
3770PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 Py_ssize_t size,
3772 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003773{
3774 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003775 Py_ssize_t startinpos;
3776 Py_ssize_t endinpos;
3777 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003778 PyUnicodeObject *v;
3779 Py_UNICODE *p;
3780 const char *end;
3781 const char *reason;
3782 PyObject *errorHandler = NULL;
3783 PyObject *exc = NULL;
3784
Neal Norwitzd43069c2006-01-08 01:12:10 +00003785#ifdef Py_UNICODE_WIDE
3786 Py_UNICODE unimax = PyUnicode_GetMax();
3787#endif
3788
Thomas Wouters89f507f2006-12-13 04:49:30 +00003789 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003790 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3791 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003792 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003793 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003795 p = PyUnicode_AS_UNICODE(v);
3796 end = s + size;
3797
3798 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003799 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003800 /* We have to sanity check the raw data, otherwise doom looms for
3801 some malformed UCS-4 data. */
3802 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003803#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003804 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003805#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003806 end-s < Py_UNICODE_SIZE
3807 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003809 startinpos = s - starts;
3810 if (end-s < Py_UNICODE_SIZE) {
3811 endinpos = end-starts;
3812 reason = "truncated input";
3813 }
3814 else {
3815 endinpos = s - starts + Py_UNICODE_SIZE;
3816 reason = "illegal code point (> 0x10FFFF)";
3817 }
3818 outpos = p - PyUnicode_AS_UNICODE(v);
3819 if (unicode_decode_call_errorhandler(
3820 errors, &errorHandler,
3821 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003822 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003823 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003824 goto onError;
3825 }
3826 }
3827 else {
3828 p++;
3829 s += Py_UNICODE_SIZE;
3830 }
3831 }
3832
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003833 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003834 goto onError;
3835 Py_XDECREF(errorHandler);
3836 Py_XDECREF(exc);
3837 return (PyObject *)v;
3838
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003840 Py_XDECREF(v);
3841 Py_XDECREF(errorHandler);
3842 Py_XDECREF(exc);
3843 return NULL;
3844}
3845
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846/* --- Latin-1 Codec ------------------------------------------------------ */
3847
3848PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003849 Py_ssize_t size,
3850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851{
3852 PyUnicodeObject *v;
3853 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003854 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003857 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003858 Py_UNICODE r = *(unsigned char*)s;
3859 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003860 }
3861
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 v = _PyUnicode_New(size);
3863 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003868 e = s + size;
3869 /* Unrolling the copy makes it much faster by reducing the looping
3870 overhead. This is similar to what many memcpy() implementations do. */
3871 unrolled_end = e - 4;
3872 while (s < unrolled_end) {
3873 p[0] = (unsigned char) s[0];
3874 p[1] = (unsigned char) s[1];
3875 p[2] = (unsigned char) s[2];
3876 p[3] = (unsigned char) s[3];
3877 s += 4;
3878 p += 4;
3879 }
3880 while (s < e)
3881 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 Py_XDECREF(v);
3886 return NULL;
3887}
3888
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003889/* create or adjust a UnicodeEncodeError */
3890static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 const char *encoding,
3892 const Py_UNICODE *unicode, Py_ssize_t size,
3893 Py_ssize_t startpos, Py_ssize_t endpos,
3894 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 *exceptionObject = PyUnicodeEncodeError_Create(
3898 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 }
3900 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3902 goto onError;
3903 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3904 goto onError;
3905 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3906 goto onError;
3907 return;
3908 onError:
3909 Py_DECREF(*exceptionObject);
3910 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 }
3912}
3913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914/* raises a UnicodeEncodeError */
3915static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 const char *encoding,
3917 const Py_UNICODE *unicode, Py_ssize_t size,
3918 Py_ssize_t startpos, Py_ssize_t endpos,
3919 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920{
3921 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925}
3926
3927/* error handling callback helper:
3928 build arguments, call the callback and check the arguments,
3929 put the result into newpos and return the replacement string, which
3930 has to be freed by the caller */
3931static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 PyObject **errorHandler,
3933 const char *encoding, const char *reason,
3934 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3935 Py_ssize_t startpos, Py_ssize_t endpos,
3936 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003938 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939
3940 PyObject *restuple;
3941 PyObject *resunicode;
3942
3943 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003946 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 }
3948
3949 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953
3954 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003959 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 Py_DECREF(restuple);
3961 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003963 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 &resunicode, newpos)) {
3965 Py_DECREF(restuple);
3966 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003968 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
3969 PyErr_SetString(PyExc_TypeError, &argparse[3]);
3970 Py_DECREF(restuple);
3971 return NULL;
3972 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003975 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3977 Py_DECREF(restuple);
3978 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 Py_INCREF(resunicode);
3981 Py_DECREF(restuple);
3982 return resunicode;
3983}
3984
3985static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 Py_ssize_t size,
3987 const char *errors,
3988 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989{
3990 /* output object */
3991 PyObject *res;
3992 /* pointers to the beginning and end+1 of input */
3993 const Py_UNICODE *startp = p;
3994 const Py_UNICODE *endp = p + size;
3995 /* pointer to the beginning of the unencodable characters */
3996 /* const Py_UNICODE *badp = NULL; */
3997 /* pointer into the output */
3998 char *str;
3999 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004000 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004001 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4002 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 PyObject *errorHandler = NULL;
4004 PyObject *exc = NULL;
4005 /* the following variable is used for caching string comparisons
4006 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4007 int known_errorHandler = -1;
4008
4009 /* allocate enough for a simple encoding without
4010 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004011 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004012 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004013 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004015 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004016 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 ressize = size;
4018
4019 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004020 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 /* can we encode this? */
4023 if (c<limit) {
4024 /* no overflow check, because we know that the space is enough */
4025 *str++ = (char)c;
4026 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004027 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 else {
4029 Py_ssize_t unicodepos = p-startp;
4030 Py_ssize_t requiredsize;
4031 PyObject *repunicode;
4032 Py_ssize_t repsize;
4033 Py_ssize_t newpos;
4034 Py_ssize_t respos;
4035 Py_UNICODE *uni2;
4036 /* startpos for collecting unencodable chars */
4037 const Py_UNICODE *collstart = p;
4038 const Py_UNICODE *collend = p;
4039 /* find all unecodable characters */
4040 while ((collend < endp) && ((*collend)>=limit))
4041 ++collend;
4042 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4043 if (known_errorHandler==-1) {
4044 if ((errors==NULL) || (!strcmp(errors, "strict")))
4045 known_errorHandler = 1;
4046 else if (!strcmp(errors, "replace"))
4047 known_errorHandler = 2;
4048 else if (!strcmp(errors, "ignore"))
4049 known_errorHandler = 3;
4050 else if (!strcmp(errors, "xmlcharrefreplace"))
4051 known_errorHandler = 4;
4052 else
4053 known_errorHandler = 0;
4054 }
4055 switch (known_errorHandler) {
4056 case 1: /* strict */
4057 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4058 goto onError;
4059 case 2: /* replace */
4060 while (collstart++<collend)
4061 *str++ = '?'; /* fall through */
4062 case 3: /* ignore */
4063 p = collend;
4064 break;
4065 case 4: /* xmlcharrefreplace */
4066 respos = str - PyBytes_AS_STRING(res);
4067 /* determine replacement size (temporarily (mis)uses p) */
4068 for (p = collstart, repsize = 0; p < collend; ++p) {
4069 if (*p<10)
4070 repsize += 2+1+1;
4071 else if (*p<100)
4072 repsize += 2+2+1;
4073 else if (*p<1000)
4074 repsize += 2+3+1;
4075 else if (*p<10000)
4076 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004077#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 else
4079 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004080#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 else if (*p<100000)
4082 repsize += 2+5+1;
4083 else if (*p<1000000)
4084 repsize += 2+6+1;
4085 else
4086 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004087#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 }
4089 requiredsize = respos+repsize+(endp-collend);
4090 if (requiredsize > ressize) {
4091 if (requiredsize<2*ressize)
4092 requiredsize = 2*ressize;
4093 if (_PyBytes_Resize(&res, requiredsize))
4094 goto onError;
4095 str = PyBytes_AS_STRING(res) + respos;
4096 ressize = requiredsize;
4097 }
4098 /* generate replacement (temporarily (mis)uses p) */
4099 for (p = collstart; p < collend; ++p) {
4100 str += sprintf(str, "&#%d;", (int)*p);
4101 }
4102 p = collend;
4103 break;
4104 default:
4105 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4106 encoding, reason, startp, size, &exc,
4107 collstart-startp, collend-startp, &newpos);
4108 if (repunicode == NULL)
4109 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004110 if (!PyUnicode_Check(repunicode)) {
4111 /* Implementation limitation: byte results not supported yet. */
4112 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
4113 Py_DECREF(repunicode);
4114 goto onError;
4115 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 /* need more space? (at least enough for what we
4117 have+the replacement+the rest of the string, so
4118 we won't have to check space for encodable characters) */
4119 respos = str - PyBytes_AS_STRING(res);
4120 repsize = PyUnicode_GET_SIZE(repunicode);
4121 requiredsize = respos+repsize+(endp-collend);
4122 if (requiredsize > ressize) {
4123 if (requiredsize<2*ressize)
4124 requiredsize = 2*ressize;
4125 if (_PyBytes_Resize(&res, requiredsize)) {
4126 Py_DECREF(repunicode);
4127 goto onError;
4128 }
4129 str = PyBytes_AS_STRING(res) + respos;
4130 ressize = requiredsize;
4131 }
4132 /* check if there is anything unencodable in the replacement
4133 and copy it to the output */
4134 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4135 c = *uni2;
4136 if (c >= limit) {
4137 raise_encode_exception(&exc, encoding, startp, size,
4138 unicodepos, unicodepos+1, reason);
4139 Py_DECREF(repunicode);
4140 goto onError;
4141 }
4142 *str = (char)c;
4143 }
4144 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004145 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004146 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004147 }
4148 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004149 /* Resize if we allocated to much */
4150 size = str - PyBytes_AS_STRING(res);
4151 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004152 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004153 if (_PyBytes_Resize(&res, size) < 0)
4154 goto onError;
4155 }
4156
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 Py_XDECREF(errorHandler);
4158 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004159 return res;
4160
4161 onError:
4162 Py_XDECREF(res);
4163 Py_XDECREF(errorHandler);
4164 Py_XDECREF(exc);
4165 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166}
4167
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 Py_ssize_t size,
4170 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173}
4174
4175PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4176{
4177 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 PyErr_BadArgument();
4179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 }
4181 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 PyUnicode_GET_SIZE(unicode),
4183 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
4186/* --- 7-bit ASCII Codec -------------------------------------------------- */
4187
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 Py_ssize_t size,
4190 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 PyUnicodeObject *v;
4194 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004195 Py_ssize_t startinpos;
4196 Py_ssize_t endinpos;
4197 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 const char *e;
4199 PyObject *errorHandler = NULL;
4200 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004201
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004203 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 Py_UNICODE r = *(unsigned char*)s;
4205 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004206 }
Tim Petersced69f82003-09-16 20:30:58 +00004207
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 v = _PyUnicode_New(size);
4209 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 e = s + size;
4215 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 register unsigned char c = (unsigned char)*s;
4217 if (c < 128) {
4218 *p++ = c;
4219 ++s;
4220 }
4221 else {
4222 startinpos = s-starts;
4223 endinpos = startinpos + 1;
4224 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4225 if (unicode_decode_call_errorhandler(
4226 errors, &errorHandler,
4227 "ascii", "ordinal not in range(128)",
4228 &starts, &e, &startinpos, &endinpos, &exc, &s,
4229 &v, &outpos, &p))
4230 goto onError;
4231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004233 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4235 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 Py_XDECREF(errorHandler);
4237 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004239
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 Py_XDECREF(errorHandler);
4243 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 return NULL;
4245}
4246
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 Py_ssize_t size,
4249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252}
4253
4254PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4255{
4256 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 PyErr_BadArgument();
4258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 }
4260 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 PyUnicode_GET_SIZE(unicode),
4262 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263}
4264
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004265#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004266
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004267/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004268
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004269#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004270#define NEED_RETRY
4271#endif
4272
4273/* XXX This code is limited to "true" double-byte encodings, as
4274 a) it assumes an incomplete character consists of a single byte, and
4275 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004277
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.  IsDBCSLeadByte() alone is not sufficient, so the previous
   character boundary (from CharPrev) is examined as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* Accept when there is no previous character, when the previous
       character does not start with a lead byte, or when it spans two
       bytes. */
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4288
4289/*
4290 * Decode MBCS string into unicode object. If 'final' is set, converts
4291 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4292 */
4293static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 const char *s, /* MBCS string */
4295 int size, /* sizeof MBCS string */
4296 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004297{
4298 Py_UNICODE *p;
4299 Py_ssize_t n = 0;
4300 int usize = 0;
4301
4302 assert(size >= 0);
4303
4304 /* Skip trailing lead-byte unless 'final' is set */
4305 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004307
4308 /* First get the size of the result */
4309 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4311 if (usize == 0) {
4312 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4313 return -1;
4314 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004315 }
4316
4317 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 /* Create unicode object */
4319 *v = _PyUnicode_New(usize);
4320 if (*v == NULL)
4321 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004322 }
4323 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 /* Extend unicode object */
4325 n = PyUnicode_GET_SIZE(*v);
4326 if (_PyUnicode_Resize(v, n + usize) < 0)
4327 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004328 }
4329
4330 /* Do the conversion */
4331 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 p = PyUnicode_AS_UNICODE(*v) + n;
4333 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4334 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4335 return -1;
4336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004337 }
4338
4339 return size;
4340}
4341
4342PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 Py_ssize_t size,
4344 const char *errors,
4345 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004346{
4347 PyUnicodeObject *v = NULL;
4348 int done;
4349
4350 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004352
4353#ifdef NEED_RETRY
4354 retry:
4355 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004357 else
4358#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004360
4361 if (done < 0) {
4362 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004364 }
4365
4366 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004368
4369#ifdef NEED_RETRY
4370 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 s += done;
4372 size -= done;
4373 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004374 }
4375#endif
4376
4377 return (PyObject *)v;
4378}
4379
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004380PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 Py_ssize_t size,
4382 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004383{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004384 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4385}
4386
4387/*
4388 * Convert unicode into string object (MBCS).
4389 * Returns 0 if succeed, -1 otherwise.
4390 */
4391static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 const Py_UNICODE *p, /* unicode */
4393 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004394{
4395 int mbcssize = 0;
4396 Py_ssize_t n = 0;
4397
4398 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004399
4400 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004401 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4403 if (mbcssize == 0) {
4404 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4405 return -1;
4406 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004407 }
4408
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004409 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 /* Create string object */
4411 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4412 if (*repr == NULL)
4413 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004414 }
4415 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 /* Extend string object */
4417 n = PyBytes_Size(*repr);
4418 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4419 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004420 }
4421
4422 /* Do the conversion */
4423 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 char *s = PyBytes_AS_STRING(*repr) + n;
4425 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4426 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4427 return -1;
4428 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004429 }
4430
4431 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004432}
4433
4434PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 Py_ssize_t size,
4436 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004438 PyObject *repr = NULL;
4439 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004440
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004441#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004443 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004445 else
4446#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004448
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004449 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 Py_XDECREF(repr);
4451 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004452 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004453
4454#ifdef NEED_RETRY
4455 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 p += INT_MAX;
4457 size -= INT_MAX;
4458 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004459 }
4460#endif
4461
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004462 return repr;
4463}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004464
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004465PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4466{
4467 if (!PyUnicode_Check(unicode)) {
4468 PyErr_BadArgument();
4469 return NULL;
4470 }
4471 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 PyUnicode_GET_SIZE(unicode),
4473 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004474}
4475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004476#undef NEED_RETRY
4477
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004478#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004479
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480/* --- Character Mapping Codec -------------------------------------------- */
4481
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 Py_ssize_t size,
4484 PyObject *mapping,
4485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t startinpos;
4489 Py_ssize_t endinpos;
4490 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 PyUnicodeObject *v;
4493 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004494 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 PyObject *errorHandler = NULL;
4496 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004497 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004498 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 /* Default to Latin-1 */
4501 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503
4504 v = _PyUnicode_New(size);
4505 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004511 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 mapstring = PyUnicode_AS_UNICODE(mapping);
4513 maplen = PyUnicode_GET_SIZE(mapping);
4514 while (s < e) {
4515 unsigned char ch = *s;
4516 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 if (ch < maplen)
4519 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 if (x == 0xfffe) {
4522 /* undefined mapping */
4523 outpos = p-PyUnicode_AS_UNICODE(v);
4524 startinpos = s-starts;
4525 endinpos = startinpos+1;
4526 if (unicode_decode_call_errorhandler(
4527 errors, &errorHandler,
4528 "charmap", "character maps to <undefined>",
4529 &starts, &e, &startinpos, &endinpos, &exc, &s,
4530 &v, &outpos, &p)) {
4531 goto onError;
4532 }
4533 continue;
4534 }
4535 *p++ = x;
4536 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004537 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004538 }
4539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 while (s < e) {
4541 unsigned char ch = *s;
4542 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004543
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4545 w = PyLong_FromLong((long)ch);
4546 if (w == NULL)
4547 goto onError;
4548 x = PyObject_GetItem(mapping, w);
4549 Py_DECREF(w);
4550 if (x == NULL) {
4551 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4552 /* No mapping found means: mapping is undefined. */
4553 PyErr_Clear();
4554 x = Py_None;
4555 Py_INCREF(x);
4556 } else
4557 goto onError;
4558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004559
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 /* Apply mapping */
4561 if (PyLong_Check(x)) {
4562 long value = PyLong_AS_LONG(x);
4563 if (value < 0 || value > 65535) {
4564 PyErr_SetString(PyExc_TypeError,
4565 "character mapping must be in range(65536)");
4566 Py_DECREF(x);
4567 goto onError;
4568 }
4569 *p++ = (Py_UNICODE)value;
4570 }
4571 else if (x == Py_None) {
4572 /* undefined mapping */
4573 outpos = p-PyUnicode_AS_UNICODE(v);
4574 startinpos = s-starts;
4575 endinpos = startinpos+1;
4576 if (unicode_decode_call_errorhandler(
4577 errors, &errorHandler,
4578 "charmap", "character maps to <undefined>",
4579 &starts, &e, &startinpos, &endinpos, &exc, &s,
4580 &v, &outpos, &p)) {
4581 Py_DECREF(x);
4582 goto onError;
4583 }
4584 Py_DECREF(x);
4585 continue;
4586 }
4587 else if (PyUnicode_Check(x)) {
4588 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004589
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 if (targetsize == 1)
4591 /* 1-1 mapping */
4592 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004593
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 else if (targetsize > 1) {
4595 /* 1-n mapping */
4596 if (targetsize > extrachars) {
4597 /* resize first */
4598 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4599 Py_ssize_t needed = (targetsize - extrachars) + \
4600 (targetsize << 2);
4601 extrachars += needed;
4602 /* XXX overflow detection missing */
4603 if (_PyUnicode_Resize(&v,
4604 PyUnicode_GET_SIZE(v) + needed) < 0) {
4605 Py_DECREF(x);
4606 goto onError;
4607 }
4608 p = PyUnicode_AS_UNICODE(v) + oldpos;
4609 }
4610 Py_UNICODE_COPY(p,
4611 PyUnicode_AS_UNICODE(x),
4612 targetsize);
4613 p += targetsize;
4614 extrachars -= targetsize;
4615 }
4616 /* 1-0 mapping: skip the character */
4617 }
4618 else {
4619 /* wrong return value */
4620 PyErr_SetString(PyExc_TypeError,
4621 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004622 Py_DECREF(x);
4623 goto onError;
4624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 Py_DECREF(x);
4626 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 }
4629 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4631 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 Py_XDECREF(errorHandler);
4633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004635
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 Py_XDECREF(errorHandler);
4638 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 Py_XDECREF(v);
4640 return NULL;
4641}
4642
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004643/* Charmap encoding: the lookup table */
4644
4645struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 PyObject_HEAD
4647 unsigned char level1[32];
4648 int count2, count3;
4649 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004650};
4651
4652static PyObject*
4653encoding_map_size(PyObject *obj, PyObject* args)
4654{
4655 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004656 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004658}
4659
4660static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004661 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 PyDoc_STR("Return the size (in bytes) of this object") },
4663 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004664};
4665
4666static void
4667encoding_map_dealloc(PyObject* o)
4668{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004669 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004670}
4671
4672static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004673 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 "EncodingMap", /*tp_name*/
4675 sizeof(struct encoding_map), /*tp_basicsize*/
4676 0, /*tp_itemsize*/
4677 /* methods */
4678 encoding_map_dealloc, /*tp_dealloc*/
4679 0, /*tp_print*/
4680 0, /*tp_getattr*/
4681 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004682 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 0, /*tp_repr*/
4684 0, /*tp_as_number*/
4685 0, /*tp_as_sequence*/
4686 0, /*tp_as_mapping*/
4687 0, /*tp_hash*/
4688 0, /*tp_call*/
4689 0, /*tp_str*/
4690 0, /*tp_getattro*/
4691 0, /*tp_setattro*/
4692 0, /*tp_as_buffer*/
4693 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4694 0, /*tp_doc*/
4695 0, /*tp_traverse*/
4696 0, /*tp_clear*/
4697 0, /*tp_richcompare*/
4698 0, /*tp_weaklistoffset*/
4699 0, /*tp_iter*/
4700 0, /*tp_iternext*/
4701 encoding_map_methods, /*tp_methods*/
4702 0, /*tp_members*/
4703 0, /*tp_getset*/
4704 0, /*tp_base*/
4705 0, /*tp_dict*/
4706 0, /*tp_descr_get*/
4707 0, /*tp_descr_set*/
4708 0, /*tp_dictoffset*/
4709 0, /*tp_init*/
4710 0, /*tp_alloc*/
4711 0, /*tp_new*/
4712 0, /*tp_free*/
4713 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004714};
4715
4716PyObject*
4717PyUnicode_BuildEncodingMap(PyObject* string)
4718{
4719 Py_UNICODE *decode;
4720 PyObject *result;
4721 struct encoding_map *mresult;
4722 int i;
4723 int need_dict = 0;
4724 unsigned char level1[32];
4725 unsigned char level2[512];
4726 unsigned char *mlevel1, *mlevel2, *mlevel3;
4727 int count2 = 0, count3 = 0;
4728
4729 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4730 PyErr_BadArgument();
4731 return NULL;
4732 }
4733 decode = PyUnicode_AS_UNICODE(string);
4734 memset(level1, 0xFF, sizeof level1);
4735 memset(level2, 0xFF, sizeof level2);
4736
4737 /* If there isn't a one-to-one mapping of NULL to \0,
4738 or if there are non-BMP characters, we need to use
4739 a mapping dictionary. */
4740 if (decode[0] != 0)
4741 need_dict = 1;
4742 for (i = 1; i < 256; i++) {
4743 int l1, l2;
4744 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004745#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004746 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004747#endif
4748 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004749 need_dict = 1;
4750 break;
4751 }
4752 if (decode[i] == 0xFFFE)
4753 /* unmapped character */
4754 continue;
4755 l1 = decode[i] >> 11;
4756 l2 = decode[i] >> 7;
4757 if (level1[l1] == 0xFF)
4758 level1[l1] = count2++;
4759 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004760 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004761 }
4762
4763 if (count2 >= 0xFF || count3 >= 0xFF)
4764 need_dict = 1;
4765
4766 if (need_dict) {
4767 PyObject *result = PyDict_New();
4768 PyObject *key, *value;
4769 if (!result)
4770 return NULL;
4771 for (i = 0; i < 256; i++) {
4772 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004773 key = PyLong_FromLong(decode[i]);
4774 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004775 if (!key || !value)
4776 goto failed1;
4777 if (PyDict_SetItem(result, key, value) == -1)
4778 goto failed1;
4779 Py_DECREF(key);
4780 Py_DECREF(value);
4781 }
4782 return result;
4783 failed1:
4784 Py_XDECREF(key);
4785 Py_XDECREF(value);
4786 Py_DECREF(result);
4787 return NULL;
4788 }
4789
4790 /* Create a three-level trie */
4791 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4792 16*count2 + 128*count3 - 1);
4793 if (!result)
4794 return PyErr_NoMemory();
4795 PyObject_Init(result, &EncodingMapType);
4796 mresult = (struct encoding_map*)result;
4797 mresult->count2 = count2;
4798 mresult->count3 = count3;
4799 mlevel1 = mresult->level1;
4800 mlevel2 = mresult->level23;
4801 mlevel3 = mresult->level23 + 16*count2;
4802 memcpy(mlevel1, level1, 32);
4803 memset(mlevel2, 0xFF, 16*count2);
4804 memset(mlevel3, 0, 128*count3);
4805 count3 = 0;
4806 for (i = 1; i < 256; i++) {
4807 int o1, o2, o3, i2, i3;
4808 if (decode[i] == 0xFFFE)
4809 /* unmapped character */
4810 continue;
4811 o1 = decode[i]>>11;
4812 o2 = (decode[i]>>7) & 0xF;
4813 i2 = 16*mlevel1[o1] + o2;
4814 if (mlevel2[i2] == 0xFF)
4815 mlevel2[i2] = count3++;
4816 o3 = decode[i] & 0x7F;
4817 i3 = 128*mlevel2[i2] + o3;
4818 mlevel3[i3] = i;
4819 }
4820 return result;
4821}
4822
4823static int
4824encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4825{
4826 struct encoding_map *map = (struct encoding_map*)mapping;
4827 int l1 = c>>11;
4828 int l2 = (c>>7) & 0xF;
4829 int l3 = c & 0x7F;
4830 int i;
4831
4832#ifdef Py_UNICODE_WIDE
4833 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004835 }
4836#endif
4837 if (c == 0)
4838 return 0;
4839 /* level 1*/
4840 i = map->level1[l1];
4841 if (i == 0xFF) {
4842 return -1;
4843 }
4844 /* level 2*/
4845 i = map->level23[16*i+l2];
4846 if (i == 0xFF) {
4847 return -1;
4848 }
4849 /* level 3 */
4850 i = map->level23[16*map->count2 + 128*i + l3];
4851 if (i == 0) {
4852 return -1;
4853 }
4854 return i;
4855}
4856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857/* Lookup the character ch in the mapping. If the character
4858 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004859 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861{
Christian Heimes217cfd12007-12-02 14:31:20 +00004862 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 PyObject *x;
4864
4865 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 x = PyObject_GetItem(mapping, w);
4868 Py_DECREF(w);
4869 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4871 /* No mapping found means: mapping is undefined. */
4872 PyErr_Clear();
4873 x = Py_None;
4874 Py_INCREF(x);
4875 return x;
4876 } else
4877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004879 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004881 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 long value = PyLong_AS_LONG(x);
4883 if (value < 0 || value > 255) {
4884 PyErr_SetString(PyExc_TypeError,
4885 "character mapping must be in range(256)");
4886 Py_DECREF(x);
4887 return NULL;
4888 }
4889 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004891 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 /* wrong return value */
4895 PyErr_Format(PyExc_TypeError,
4896 "character mapping must return integer, bytes or None, not %.400s",
4897 x->ob_type->tp_name);
4898 Py_DECREF(x);
4899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 }
4901}
4902
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004903static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004904charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004905{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004906 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4907 /* exponentially overallocate to minimize reallocations */
4908 if (requiredsize < 2*outsize)
4909 requiredsize = 2*outsize;
4910 if (_PyBytes_Resize(outobj, requiredsize))
4911 return -1;
4912 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004913}
4914
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004919 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 space is available. Return a new reference to the object that
4921 was put in the output buffer, or Py_None, if the mapping was undefined
4922 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004923 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004925charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004928 PyObject *rep;
4929 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004930 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931
Christian Heimes90aa7642007-12-19 02:45:37 +00004932 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004933 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004935 if (res == -1)
4936 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (outsize<requiredsize)
4938 if (charmapencode_resize(outobj, outpos, requiredsize))
4939 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004940 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 outstart[(*outpos)++] = (char)res;
4942 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004943 }
4944
4945 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004948 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 Py_DECREF(rep);
4950 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004951 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 if (PyLong_Check(rep)) {
4953 Py_ssize_t requiredsize = *outpos+1;
4954 if (outsize<requiredsize)
4955 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4956 Py_DECREF(rep);
4957 return enc_EXCEPTION;
4958 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004959 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 else {
4963 const char *repchars = PyBytes_AS_STRING(rep);
4964 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4965 Py_ssize_t requiredsize = *outpos+repsize;
4966 if (outsize<requiredsize)
4967 if (charmapencode_resize(outobj, outpos, requiredsize)) {
4968 Py_DECREF(rep);
4969 return enc_EXCEPTION;
4970 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004971 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 memcpy(outstart + *outpos, repchars, repsize);
4973 *outpos += repsize;
4974 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004976 Py_DECREF(rep);
4977 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978}
4979
4980/* handle an error in PyUnicode_EncodeCharmap
4981 Return 0 on success, -1 on error */
4982static
4983int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004984 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004986 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004987 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988{
4989 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t repsize;
4991 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 Py_UNICODE *uni2;
4993 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004994 Py_ssize_t collstartpos = *inpos;
4995 Py_ssize_t collendpos = *inpos+1;
4996 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997 char *encoding = "charmap";
4998 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004999 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 /* find all unencodable characters */
5002 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005003 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005004 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 int res = encoding_map_lookup(p[collendpos], mapping);
5006 if (res != -1)
5007 break;
5008 ++collendpos;
5009 continue;
5010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005011
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 rep = charmapencode_lookup(p[collendpos], mapping);
5013 if (rep==NULL)
5014 return -1;
5015 else if (rep!=Py_None) {
5016 Py_DECREF(rep);
5017 break;
5018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005019 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 }
5022 /* cache callback name lookup
5023 * (if not done yet, i.e. it's the first error) */
5024 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 if ((errors==NULL) || (!strcmp(errors, "strict")))
5026 *known_errorHandler = 1;
5027 else if (!strcmp(errors, "replace"))
5028 *known_errorHandler = 2;
5029 else if (!strcmp(errors, "ignore"))
5030 *known_errorHandler = 3;
5031 else if (!strcmp(errors, "xmlcharrefreplace"))
5032 *known_errorHandler = 4;
5033 else
5034 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035 }
5036 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005037 case 1: /* strict */
5038 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5039 return -1;
5040 case 2: /* replace */
5041 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 x = charmapencode_output('?', mapping, res, respos);
5043 if (x==enc_EXCEPTION) {
5044 return -1;
5045 }
5046 else if (x==enc_FAILED) {
5047 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5048 return -1;
5049 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005050 }
5051 /* fall through */
5052 case 3: /* ignore */
5053 *inpos = collendpos;
5054 break;
5055 case 4: /* xmlcharrefreplace */
5056 /* generate replacement (temporarily (mis)uses p) */
5057 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 char buffer[2+29+1+1];
5059 char *cp;
5060 sprintf(buffer, "&#%d;", (int)p[collpos]);
5061 for (cp = buffer; *cp; ++cp) {
5062 x = charmapencode_output(*cp, mapping, res, respos);
5063 if (x==enc_EXCEPTION)
5064 return -1;
5065 else if (x==enc_FAILED) {
5066 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5067 return -1;
5068 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005069 }
5070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005071 *inpos = collendpos;
5072 break;
5073 default:
5074 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 encoding, reason, p, size, exceptionObject,
5076 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005077 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 return -1;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005079 if (!PyUnicode_Check(repunicode)) {
5080 /* Implementation limitation: byte results not supported yet. */
5081 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5082 Py_DECREF(repunicode);
5083 return -1;
5084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005085 /* generate replacement */
5086 repsize = PyUnicode_GET_SIZE(repunicode);
5087 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 x = charmapencode_output(*uni2, mapping, res, respos);
5089 if (x==enc_EXCEPTION) {
5090 return -1;
5091 }
5092 else if (x==enc_FAILED) {
5093 Py_DECREF(repunicode);
5094 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5095 return -1;
5096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005097 }
5098 *inpos = newpos;
5099 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 }
5101 return 0;
5102}
5103
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 Py_ssize_t size,
5106 PyObject *mapping,
5107 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 /* output object */
5110 PyObject *res = NULL;
5111 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005112 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005114 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 PyObject *errorHandler = NULL;
5116 PyObject *exc = NULL;
5117 /* the following variable is used for caching string comparisons
5118 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5119 * 3=ignore, 4=xmlcharrefreplace */
5120 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
5122 /* Default to Latin-1 */
5123 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 /* allocate enough for a simple encoding without
5127 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005128 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 if (res == NULL)
5130 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005131 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 /* try to encode it */
5136 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5137 if (x==enc_EXCEPTION) /* error */
5138 goto onError;
5139 if (x==enc_FAILED) { /* unencodable character */
5140 if (charmap_encoding_error(p, size, &inpos, mapping,
5141 &exc,
5142 &known_errorHandler, &errorHandler, errors,
5143 &res, &respos)) {
5144 goto onError;
5145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 else
5148 /* done with this character => adjust input position */
5149 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005153 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005154 if (_PyBytes_Resize(&res, respos) < 0)
5155 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157 Py_XDECREF(exc);
5158 Py_XDECREF(errorHandler);
5159 return res;
5160
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 Py_XDECREF(res);
5163 Py_XDECREF(exc);
5164 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 return NULL;
5166}
5167
5168PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
5171 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 PyErr_BadArgument();
5173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 }
5175 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 PyUnicode_GET_SIZE(unicode),
5177 mapping,
5178 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179}
5180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181/* create or adjust a UnicodeTranslateError */
5182static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 const Py_UNICODE *unicode, Py_ssize_t size,
5184 Py_ssize_t startpos, Py_ssize_t endpos,
5185 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005188 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 }
5191 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5193 goto onError;
5194 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5195 goto onError;
5196 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5197 goto onError;
5198 return;
5199 onError:
5200 Py_DECREF(*exceptionObject);
5201 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 }
5203}
5204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205/* raises a UnicodeTranslateError */
5206static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 const Py_UNICODE *unicode, Py_ssize_t size,
5208 Py_ssize_t startpos, Py_ssize_t endpos,
5209 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210{
5211 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215}
5216
5217/* error handling callback helper:
5218 build arguments, call the callback and check the arguments,
5219 put the result into newpos and return the replacement string, which
5220 has to be freed by the caller */
5221static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 PyObject **errorHandler,
5223 const char *reason,
5224 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5225 Py_ssize_t startpos, Py_ssize_t endpos,
5226 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005228 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005230 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 PyObject *restuple;
5232 PyObject *resunicode;
5233
5234 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 }
5239
5240 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244
5245 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005250 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 Py_DECREF(restuple);
5252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 }
5254 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 &resunicode, &i_newpos)) {
5256 Py_DECREF(restuple);
5257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005258 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005259 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005261 else
5262 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005263 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5265 Py_DECREF(restuple);
5266 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 Py_INCREF(resunicode);
5269 Py_DECREF(restuple);
5270 return resunicode;
5271}
5272
5273/* Lookup the character ch in the mapping and put the result in result,
5274 which must be decrefed by the caller.
5275 Return 0 on success, -1 on error */
5276static
5277int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5278{
Christian Heimes217cfd12007-12-02 14:31:20 +00005279 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 PyObject *x;
5281
5282 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284 x = PyObject_GetItem(mapping, w);
5285 Py_DECREF(w);
5286 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5288 /* No mapping found means: use 1:1 mapping. */
5289 PyErr_Clear();
5290 *result = NULL;
5291 return 0;
5292 } else
5293 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 }
5295 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 *result = x;
5297 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005299 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 long value = PyLong_AS_LONG(x);
5301 long max = PyUnicode_GetMax();
5302 if (value < 0 || value > max) {
5303 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005304 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 Py_DECREF(x);
5306 return -1;
5307 }
5308 *result = x;
5309 return 0;
5310 }
5311 else if (PyUnicode_Check(x)) {
5312 *result = x;
5313 return 0;
5314 }
5315 else {
5316 /* wrong return value */
5317 PyErr_SetString(PyExc_TypeError,
5318 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005319 Py_DECREF(x);
5320 return -1;
5321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322}
5323/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 if not reallocate and adjust various state variables.
5325 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005326static
Walter Dörwald4894c302003-10-24 14:25:28 +00005327int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005331 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 /* remember old output position */
5333 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5334 /* exponentially overallocate to minimize reallocations */
5335 if (requiredsize < 2 * oldsize)
5336 requiredsize = 2 * oldsize;
5337 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5338 return -1;
5339 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 }
5341 return 0;
5342}
5343/* lookup the character, put the result in the output string and adjust
5344 various state variables. Return a new reference to the object that
5345 was put in the output buffer in *result, or Py_None, if the mapping was
5346 undefined (in which case no character was written).
5347 The called must decref result.
5348 Return 0 on success, -1 on error. */
5349static
Walter Dörwald4894c302003-10-24 14:25:28 +00005350int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5352 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353{
Walter Dörwald4894c302003-10-24 14:25:28 +00005354 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 /* not found => default to 1:1 mapping */
5358 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 }
5360 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005362 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 /* no overflow check, because we know that the space is enough */
5364 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 }
5366 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5368 if (repsize==1) {
5369 /* no overflow check, because we know that the space is enough */
5370 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5371 }
5372 else if (repsize!=0) {
5373 /* more than one character */
5374 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5375 (insize - (curinp-startinp)) +
5376 repsize - 1;
5377 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5378 return -1;
5379 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5380 *outp += repsize;
5381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 }
5383 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 return 0;
5386}
5387
5388PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 Py_ssize_t size,
5390 PyObject *mapping,
5391 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 /* output object */
5394 PyObject *res = NULL;
5395 /* pointers to the beginning and end+1 of input */
5396 const Py_UNICODE *startp = p;
5397 const Py_UNICODE *endp = p + size;
5398 /* pointer into the output */
5399 Py_UNICODE *str;
5400 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402 char *reason = "character maps to <undefined>";
5403 PyObject *errorHandler = NULL;
5404 PyObject *exc = NULL;
5405 /* the following variable is used for caching string comparisons
5406 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5407 * 3=ignore, 4=xmlcharrefreplace */
5408 int known_errorHandler = -1;
5409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 PyErr_BadArgument();
5412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414
5415 /* allocate enough for a simple 1:1 translation without
5416 replacements, if we need more, we'll resize */
5417 res = PyUnicode_FromUnicode(NULL, size);
5418 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005424 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 /* try to encode it */
5426 PyObject *x = NULL;
5427 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5428 Py_XDECREF(x);
5429 goto onError;
5430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005431 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 if (x!=Py_None) /* it worked => adjust input pointer */
5433 ++p;
5434 else { /* untranslatable character */
5435 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5436 Py_ssize_t repsize;
5437 Py_ssize_t newpos;
5438 Py_UNICODE *uni2;
5439 /* startpos for collecting untranslatable chars */
5440 const Py_UNICODE *collstart = p;
5441 const Py_UNICODE *collend = p+1;
5442 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 /* find all untranslatable characters */
5445 while (collend < endp) {
5446 if (charmaptranslate_lookup(*collend, mapping, &x))
5447 goto onError;
5448 Py_XDECREF(x);
5449 if (x!=Py_None)
5450 break;
5451 ++collend;
5452 }
5453 /* cache callback name lookup
5454 * (if not done yet, i.e. it's the first error) */
5455 if (known_errorHandler==-1) {
5456 if ((errors==NULL) || (!strcmp(errors, "strict")))
5457 known_errorHandler = 1;
5458 else if (!strcmp(errors, "replace"))
5459 known_errorHandler = 2;
5460 else if (!strcmp(errors, "ignore"))
5461 known_errorHandler = 3;
5462 else if (!strcmp(errors, "xmlcharrefreplace"))
5463 known_errorHandler = 4;
5464 else
5465 known_errorHandler = 0;
5466 }
5467 switch (known_errorHandler) {
5468 case 1: /* strict */
5469 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005470 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 case 2: /* replace */
5472 /* No need to check for space, this is a 1:1 replacement */
5473 for (coll = collstart; coll<collend; ++coll)
5474 *str++ = '?';
5475 /* fall through */
5476 case 3: /* ignore */
5477 p = collend;
5478 break;
5479 case 4: /* xmlcharrefreplace */
5480 /* generate replacement (temporarily (mis)uses p) */
5481 for (p = collstart; p < collend; ++p) {
5482 char buffer[2+29+1+1];
5483 char *cp;
5484 sprintf(buffer, "&#%d;", (int)*p);
5485 if (charmaptranslate_makespace(&res, &str,
5486 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5487 goto onError;
5488 for (cp = buffer; *cp; ++cp)
5489 *str++ = *cp;
5490 }
5491 p = collend;
5492 break;
5493 default:
5494 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5495 reason, startp, size, &exc,
5496 collstart-startp, collend-startp, &newpos);
5497 if (repunicode == NULL)
5498 goto onError;
5499 /* generate replacement */
5500 repsize = PyUnicode_GET_SIZE(repunicode);
5501 if (charmaptranslate_makespace(&res, &str,
5502 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5503 Py_DECREF(repunicode);
5504 goto onError;
5505 }
5506 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5507 *str++ = *uni2;
5508 p = startp + newpos;
5509 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005511 }
5512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 /* Resize if we allocated to much */
5514 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005515 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 if (PyUnicode_Resize(&res, respos) < 0)
5517 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 }
5519 Py_XDECREF(exc);
5520 Py_XDECREF(errorHandler);
5521 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 Py_XDECREF(res);
5525 Py_XDECREF(exc);
5526 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 return NULL;
5528}
5529
5530PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 PyObject *mapping,
5532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
5534 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005535
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 str = PyUnicode_FromObject(str);
5537 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 PyUnicode_GET_SIZE(str),
5541 mapping,
5542 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 Py_DECREF(str);
5544 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005545
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 Py_XDECREF(str);
5548 return NULL;
5549}
Tim Petersced69f82003-09-16 20:30:58 +00005550
Guido van Rossum9e896b32000-04-05 20:11:21 +00005551/* --- Decimal Encoder ---------------------------------------------------- */
5552
5553int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t length,
5555 char *output,
5556 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005557{
5558 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 PyObject *errorHandler = NULL;
5560 PyObject *exc = NULL;
5561 const char *encoding = "decimal";
5562 const char *reason = "invalid decimal Unicode string";
5563 /* the following variable is used for caching string comparisons
5564 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5565 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005566
5567 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 PyErr_BadArgument();
5569 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005570 }
5571
5572 p = s;
5573 end = s + length;
5574 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 register Py_UNICODE ch = *p;
5576 int decimal;
5577 PyObject *repunicode;
5578 Py_ssize_t repsize;
5579 Py_ssize_t newpos;
5580 Py_UNICODE *uni2;
5581 Py_UNICODE *collstart;
5582 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005583
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005585 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 ++p;
5587 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 decimal = Py_UNICODE_TODECIMAL(ch);
5590 if (decimal >= 0) {
5591 *output++ = '0' + decimal;
5592 ++p;
5593 continue;
5594 }
5595 if (0 < ch && ch < 256) {
5596 *output++ = (char)ch;
5597 ++p;
5598 continue;
5599 }
5600 /* All other characters are considered unencodable */
5601 collstart = p;
5602 collend = p+1;
5603 while (collend < end) {
5604 if ((0 < *collend && *collend < 256) ||
5605 !Py_UNICODE_ISSPACE(*collend) ||
5606 Py_UNICODE_TODECIMAL(*collend))
5607 break;
5608 }
5609 /* cache callback name lookup
5610 * (if not done yet, i.e. it's the first error) */
5611 if (known_errorHandler==-1) {
5612 if ((errors==NULL) || (!strcmp(errors, "strict")))
5613 known_errorHandler = 1;
5614 else if (!strcmp(errors, "replace"))
5615 known_errorHandler = 2;
5616 else if (!strcmp(errors, "ignore"))
5617 known_errorHandler = 3;
5618 else if (!strcmp(errors, "xmlcharrefreplace"))
5619 known_errorHandler = 4;
5620 else
5621 known_errorHandler = 0;
5622 }
5623 switch (known_errorHandler) {
5624 case 1: /* strict */
5625 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5626 goto onError;
5627 case 2: /* replace */
5628 for (p = collstart; p < collend; ++p)
5629 *output++ = '?';
5630 /* fall through */
5631 case 3: /* ignore */
5632 p = collend;
5633 break;
5634 case 4: /* xmlcharrefreplace */
5635 /* generate replacement (temporarily (mis)uses p) */
5636 for (p = collstart; p < collend; ++p)
5637 output += sprintf(output, "&#%d;", (int)*p);
5638 p = collend;
5639 break;
5640 default:
5641 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5642 encoding, reason, s, length, &exc,
5643 collstart-s, collend-s, &newpos);
5644 if (repunicode == NULL)
5645 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005646 if (!PyUnicode_Check(repunicode)) {
5647 /* Implementation limitation: byte results not supported yet. */
5648 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5649 Py_DECREF(repunicode);
5650 goto onError;
5651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 /* generate replacement */
5653 repsize = PyUnicode_GET_SIZE(repunicode);
5654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5655 Py_UNICODE ch = *uni2;
5656 if (Py_UNICODE_ISSPACE(ch))
5657 *output++ = ' ';
5658 else {
5659 decimal = Py_UNICODE_TODECIMAL(ch);
5660 if (decimal >= 0)
5661 *output++ = '0' + decimal;
5662 else if (0 < ch && ch < 256)
5663 *output++ = (char)ch;
5664 else {
5665 Py_DECREF(repunicode);
5666 raise_encode_exception(&exc, encoding,
5667 s, length, collstart-s, collend-s, reason);
5668 goto onError;
5669 }
5670 }
5671 }
5672 p = s + newpos;
5673 Py_DECREF(repunicode);
5674 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005675 }
5676 /* 0-terminate the output string */
5677 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 Py_XDECREF(exc);
5679 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005680 return 0;
5681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_XDECREF(exc);
5684 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005685 return -1;
5686}
5687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688/* --- Helpers ------------------------------------------------------------ */
5689
Eric Smith8c663262007-08-25 02:26:07 +00005690#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005691#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005692#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005693/* Include _ParseTupleFinds from find.h */
5694#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005695#include "stringlib/find.h"
5696#include "stringlib/partition.h"
5697
Eric Smith5807c412008-05-11 21:00:57 +00005698#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005699#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005700#include "stringlib/localeutil.h"
5701
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` that must be in scope at
   the expansion site, using (obj)->length as the sequence length:
   a negative `start` is taken relative to the end and then floored at 0;
   `end` is capped at the length, and a negative `end` is taken relative
   to the end and then floored at 0.  `obj` is evaluated more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5714
Martin v. Löwis18e16552006-02-15 17:27:45 +00005715Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005716 PyObject *substr,
5717 Py_ssize_t start,
5718 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005721 PyUnicodeObject* str_obj;
5722 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005723
Thomas Wouters477c8d52006-05-27 19:21:47 +00005724 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5725 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005727 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5728 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 Py_DECREF(str_obj);
5730 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 }
Tim Petersced69f82003-09-16 20:30:58 +00005732
Thomas Wouters477c8d52006-05-27 19:21:47 +00005733 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005734
Thomas Wouters477c8d52006-05-27 19:21:47 +00005735 result = stringlib_count(
5736 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5737 );
5738
5739 Py_DECREF(sub_obj);
5740 Py_DECREF(str_obj);
5741
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 return result;
5743}
5744
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005746 PyObject *sub,
5747 Py_ssize_t start,
5748 Py_ssize_t end,
5749 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005754 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005756 sub = PyUnicode_FromObject(sub);
5757 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 Py_DECREF(str);
5759 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 }
Tim Petersced69f82003-09-16 20:30:58 +00005761
Thomas Wouters477c8d52006-05-27 19:21:47 +00005762 if (direction > 0)
5763 result = stringlib_find_slice(
5764 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5765 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5766 start, end
5767 );
5768 else
5769 result = stringlib_rfind_slice(
5770 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5771 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5772 start, end
5773 );
5774
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005776 Py_DECREF(sub);
5777
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return result;
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 PyUnicodeObject *substring,
5784 Py_ssize_t start,
5785 Py_ssize_t end,
5786 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 if (substring->length == 0)
5789 return 1;
5790
Thomas Wouters477c8d52006-05-27 19:21:47 +00005791 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
5793 end -= substring->length;
5794 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
5797 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 if (Py_UNICODE_MATCH(self, end, substring))
5799 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 } else {
5801 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 }
5804
5805 return 0;
5806}
5807
Martin v. Löwis18e16552006-02-15 17:27:45 +00005808Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 PyObject *substr,
5810 Py_ssize_t start,
5811 Py_ssize_t end,
5812 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005815
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 str = PyUnicode_FromObject(str);
5817 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 substr = PyUnicode_FromObject(substr);
5820 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 Py_DECREF(str);
5822 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 }
Tim Petersced69f82003-09-16 20:30:58 +00005824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 (PyUnicodeObject *)substr,
5827 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 Py_DECREF(str);
5829 Py_DECREF(substr);
5830 return result;
5831}
5832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833/* Apply fixfct filter to the Unicode object self and return a
5834 reference to the modified object */
5835
Tim Petersced69f82003-09-16 20:30:58 +00005836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839{
5840
5841 PyUnicodeObject *u;
5842
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005843 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005846
5847 Py_UNICODE_COPY(u->str, self->str, self->length);
5848
Tim Peters7a29bd52001-09-12 03:03:31 +00005849 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 /* fixfct should return TRUE if it modified the buffer. If
5851 FALSE, return a reference to the original buffer instead
5852 (to save space, not time) */
5853 Py_INCREF(self);
5854 Py_DECREF(u);
5855 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 }
5857 return (PyObject*) u;
5858}
5859
Tim Petersced69f82003-09-16 20:30:58 +00005860static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861int fixupper(PyUnicodeObject *self)
5862{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 Py_UNICODE *s = self->str;
5865 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ch = Py_UNICODE_TOUPPER(*s);
5871 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 *s = ch;
5874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 s++;
5876 }
5877
5878 return status;
5879}
5880
Tim Petersced69f82003-09-16 20:30:58 +00005881static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882int fixlower(PyUnicodeObject *self)
5883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005884 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 Py_UNICODE *s = self->str;
5886 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005890
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 ch = Py_UNICODE_TOLOWER(*s);
5892 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 *s = ch;
5895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 s++;
5897 }
5898
5899 return status;
5900}
5901
Tim Petersced69f82003-09-16 20:30:58 +00005902static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903int fixswapcase(PyUnicodeObject *self)
5904{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005905 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 Py_UNICODE *s = self->str;
5907 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005908
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 while (len-- > 0) {
5910 if (Py_UNICODE_ISUPPER(*s)) {
5911 *s = Py_UNICODE_TOLOWER(*s);
5912 status = 1;
5913 } else if (Py_UNICODE_ISLOWER(*s)) {
5914 *s = Py_UNICODE_TOUPPER(*s);
5915 status = 1;
5916 }
5917 s++;
5918 }
5919
5920 return status;
5921}
5922
Tim Petersced69f82003-09-16 20:30:58 +00005923static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924int fixcapitalize(PyUnicodeObject *self)
5925{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005927 Py_UNICODE *s = self->str;
5928 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005929
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005930 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005932 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 *s = Py_UNICODE_TOUPPER(*s);
5934 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005936 s++;
5937 while (--len > 0) {
5938 if (Py_UNICODE_ISUPPER(*s)) {
5939 *s = Py_UNICODE_TOLOWER(*s);
5940 status = 1;
5941 }
5942 s++;
5943 }
5944 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
5947static
5948int fixtitle(PyUnicodeObject *self)
5949{
5950 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5951 register Py_UNICODE *e;
5952 int previous_is_cased;
5953
5954 /* Shortcut for single character strings */
5955 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5957 if (*p != ch) {
5958 *p = ch;
5959 return 1;
5960 }
5961 else
5962 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 }
Tim Petersced69f82003-09-16 20:30:58 +00005964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 e = p + PyUnicode_GET_SIZE(self);
5966 previous_is_cased = 0;
5967 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 if (previous_is_cased)
5971 *p = Py_UNICODE_TOLOWER(ch);
5972 else
5973 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005974
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 if (Py_UNICODE_ISLOWER(ch) ||
5976 Py_UNICODE_ISUPPER(ch) ||
5977 Py_UNICODE_ISTITLE(ch))
5978 previous_is_cased = 1;
5979 else
5980 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 }
5982 return 1;
5983}
5984
Tim Peters8ce9f162004-08-27 01:49:32 +00005985PyObject *
5986PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987{
Skip Montanaro6543b452004-09-16 03:28:13 +00005988 const Py_UNICODE blank = ' ';
5989 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005991 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005992 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5993 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005994 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5995 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005996 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005997 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Tim Peters05eba1f2004-08-27 21:32:02 +00005999 fseq = PySequence_Fast(seq, "");
6000 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006001 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006002 }
6003
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006004 /* NOTE: the following code can't call back into Python code,
6005 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006006 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006007
Tim Peters05eba1f2004-08-27 21:32:02 +00006008 seqlen = PySequence_Fast_GET_SIZE(fseq);
6009 /* If empty sequence, return u"". */
6010 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006011 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6012 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006013 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006014 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006015 /* If singleton sequence with an exact Unicode, return that. */
6016 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 item = items[0];
6018 if (PyUnicode_CheckExact(item)) {
6019 Py_INCREF(item);
6020 res = (PyUnicodeObject *)item;
6021 goto Done;
6022 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006023 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006024 else {
6025 /* Set up sep and seplen */
6026 if (separator == NULL) {
6027 sep = &blank;
6028 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006029 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006030 else {
6031 if (!PyUnicode_Check(separator)) {
6032 PyErr_Format(PyExc_TypeError,
6033 "separator: expected str instance,"
6034 " %.80s found",
6035 Py_TYPE(separator)->tp_name);
6036 goto onError;
6037 }
6038 sep = PyUnicode_AS_UNICODE(separator);
6039 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006040 }
6041 }
6042
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006043 /* There are at least two things to join, or else we have a subclass
6044 * of str in the sequence.
6045 * Do a pre-pass to figure out the total amount of space we'll
6046 * need (sz), and see whether all argument are strings.
6047 */
6048 sz = 0;
6049 for (i = 0; i < seqlen; i++) {
6050 const Py_ssize_t old_sz = sz;
6051 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 if (!PyUnicode_Check(item)) {
6053 PyErr_Format(PyExc_TypeError,
6054 "sequence item %zd: expected str instance,"
6055 " %.80s found",
6056 i, Py_TYPE(item)->tp_name);
6057 goto onError;
6058 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006059 sz += PyUnicode_GET_SIZE(item);
6060 if (i != 0)
6061 sz += seplen;
6062 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6063 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006065 goto onError;
6066 }
6067 }
Tim Petersced69f82003-09-16 20:30:58 +00006068
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006069 res = _PyUnicode_New(sz);
6070 if (res == NULL)
6071 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006072
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006073 /* Catenate everything. */
6074 res_p = PyUnicode_AS_UNICODE(res);
6075 for (i = 0; i < seqlen; ++i) {
6076 Py_ssize_t itemlen;
6077 item = items[i];
6078 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 /* Copy item, and maybe the separator. */
6080 if (i) {
6081 Py_UNICODE_COPY(res_p, sep, seplen);
6082 res_p += seplen;
6083 }
6084 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6085 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006086 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006089 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 return (PyObject *)res;
6091
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006093 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006094 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 return NULL;
6096}
6097
Tim Petersced69f82003-09-16 20:30:58 +00006098static
6099PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 Py_ssize_t left,
6101 Py_ssize_t right,
6102 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104 PyUnicodeObject *u;
6105
6106 if (left < 0)
6107 left = 0;
6108 if (right < 0)
6109 right = 0;
6110
Tim Peters7a29bd52001-09-12 03:03:31 +00006111 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 Py_INCREF(self);
6113 return self;
6114 }
6115
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006116 if (left > PY_SSIZE_T_MAX - self->length ||
6117 right > PY_SSIZE_T_MAX - (left + self->length)) {
6118 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6119 return NULL;
6120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 u = _PyUnicode_New(left + self->length + right);
6122 if (u) {
6123 if (left)
6124 Py_UNICODE_FILL(u->str, fill, left);
6125 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6126 if (right)
6127 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6128 }
6129
6130 return u;
6131}
6132
/* Append the slice data[left:right] to `list` as a new str object.

   Expects the enclosing function to provide a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label, which is jumped to
   when creating the string or appending it fails.  The temporary
   reference held in `str` is always released.

   Wrapped in do { } while (0) so that the macro expands to exactly one
   statement and is safe inside unbraced if/else bodies; invoke it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
6144static
6145PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 PyObject *list,
6147 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006149 register Py_ssize_t i;
6150 register Py_ssize_t j;
6151 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006153 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
6155 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006157 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006159 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6161 i++;
6162 if (j < i) {
6163 if (maxcount-- <= 0)
6164 break;
6165 SPLIT_APPEND(buf, j, i);
6166 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6167 i++;
6168 j = i;
6169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 }
6171 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
6174 return list;
6175
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 Py_DECREF(list);
6178 return NULL;
6179}
6180
6181PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 register Py_ssize_t i;
6185 register Py_ssize_t j;
6186 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 PyObject *list;
6188 PyObject *str;
6189 Py_UNICODE *data;
6190
6191 string = PyUnicode_FromObject(string);
6192 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 data = PyUnicode_AS_UNICODE(string);
6195 len = PyUnicode_GET_SIZE(string);
6196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 list = PyList_New(0);
6198 if (!list)
6199 goto onError;
6200
6201 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* Find a line and append it */
6205 while (i < len && !BLOOM_LINEBREAK(data[i]))
6206 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 if (i < len) {
6211 if (data[i] == '\r' && i + 1 < len &&
6212 data[i+1] == '\n')
6213 i += 2;
6214 else
6215 i++;
6216 if (keepends)
6217 eol = i;
6218 }
6219 SPLIT_APPEND(data, j, eol);
6220 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 }
6222 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 }
6225
6226 Py_DECREF(string);
6227 return list;
6228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006230 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 Py_DECREF(string);
6232 return NULL;
6233}
6234
Tim Petersced69f82003-09-16 20:30:58 +00006235static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 PyObject *list,
6238 Py_UNICODE ch,
6239 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 register Py_ssize_t i;
6242 register Py_ssize_t j;
6243 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006245 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246
6247 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 if (buf[i] == ch) {
6249 if (maxcount-- <= 0)
6250 break;
6251 SPLIT_APPEND(buf, j, i);
6252 i = j = i + 1;
6253 } else
6254 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 }
6256 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
6259 return list;
6260
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 Py_DECREF(list);
6263 return NULL;
6264}
6265
Tim Petersced69f82003-09-16 20:30:58 +00006266static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 PyObject *list,
6269 PyUnicodeObject *substring,
6270 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006272 register Py_ssize_t i;
6273 register Py_ssize_t j;
6274 Py_ssize_t len = self->length;
6275 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 PyObject *str;
6277
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006278 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 if (Py_UNICODE_MATCH(self, i, substring)) {
6280 if (maxcount-- <= 0)
6281 break;
6282 SPLIT_APPEND(self->str, j, i);
6283 i = j = i + sublen;
6284 } else
6285 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 }
6287 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 }
6290 return list;
6291
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 Py_DECREF(list);
6294 return NULL;
6295}
6296
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006297static
6298PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 PyObject *list,
6300 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 register Py_ssize_t i;
6303 register Py_ssize_t j;
6304 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006305 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006306 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006307
6308 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006310 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006312 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6314 i--;
6315 if (j > i) {
6316 if (maxcount-- <= 0)
6317 break;
6318 SPLIT_APPEND(buf, i + 1, j + 1);
6319 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6320 i--;
6321 j = i;
6322 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006323 }
6324 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006326 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006327 if (PyList_Reverse(list) < 0)
6328 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006329 return list;
6330
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006332 Py_DECREF(list);
6333 return NULL;
6334}
6335
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006337PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 PyObject *list,
6339 Py_UNICODE ch,
6340 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006342 register Py_ssize_t i;
6343 register Py_ssize_t j;
6344 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006345 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006346 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006347
6348 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 if (buf[i] == ch) {
6350 if (maxcount-- <= 0)
6351 break;
6352 SPLIT_APPEND(buf, i + 1, j + 1);
6353 j = i = i - 1;
6354 } else
6355 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006356 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006357 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006359 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006360 if (PyList_Reverse(list) < 0)
6361 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006362 return list;
6363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006365 Py_DECREF(list);
6366 return NULL;
6367}
6368
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006370PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 PyObject *list,
6372 PyUnicodeObject *substring,
6373 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006374{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006375 register Py_ssize_t i;
6376 register Py_ssize_t j;
6377 Py_ssize_t len = self->length;
6378 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006379 PyObject *str;
6380
6381 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 if (Py_UNICODE_MATCH(self, i, substring)) {
6383 if (maxcount-- <= 0)
6384 break;
6385 SPLIT_APPEND(self->str, i + sublen, j);
6386 j = i;
6387 i -= sublen;
6388 } else
6389 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006390 }
6391 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006393 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 if (PyList_Reverse(list) < 0)
6395 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006396 return list;
6397
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006399 Py_DECREF(list);
6400 return NULL;
6401}
6402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403#undef SPLIT_APPEND
6404
6405static
6406PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 PyUnicodeObject *substring,
6408 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409{
6410 PyObject *list;
6411
6412 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006413 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
6415 list = PyList_New(0);
6416 if (!list)
6417 return NULL;
6418
6419 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
6422 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 Py_DECREF(list);
6427 PyErr_SetString(PyExc_ValueError, "empty separator");
6428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 }
6430 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432}
6433
Tim Petersced69f82003-09-16 20:30:58 +00006434static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006435PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 PyUnicodeObject *substring,
6437 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006438{
6439 PyObject *list;
6440
6441 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006442 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443
6444 list = PyList_New(0);
6445 if (!list)
6446 return NULL;
6447
6448 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006450
6451 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006453
6454 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 Py_DECREF(list);
6456 PyErr_SetString(PyExc_ValueError, "empty separator");
6457 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006458 }
6459 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006461}
6462
6463static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 PyUnicodeObject *str1,
6466 PyUnicodeObject *str2,
6467 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468{
6469 PyUnicodeObject *u;
6470
6471 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
Thomas Wouters477c8d52006-05-27 19:21:47 +00006474 if (str1->length == str2->length) {
6475 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006476 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (str1->length == 1) {
6478 /* replace characters */
6479 Py_UNICODE u1, u2;
6480 if (!findchar(self->str, self->length, str1->str[0]))
6481 goto nothing;
6482 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6483 if (!u)
6484 return NULL;
6485 Py_UNICODE_COPY(u->str, self->str, self->length);
6486 u1 = str1->str[0];
6487 u2 = str2->str[0];
6488 for (i = 0; i < u->length; i++)
6489 if (u->str[i] == u1) {
6490 if (--maxcount < 0)
6491 break;
6492 u->str[i] = u2;
6493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006495 i = fastsearch(
6496 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006498 if (i < 0)
6499 goto nothing;
6500 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6501 if (!u)
6502 return NULL;
6503 Py_UNICODE_COPY(u->str, self->str, self->length);
6504 while (i <= self->length - str1->length)
6505 if (Py_UNICODE_MATCH(self, i, str1)) {
6506 if (--maxcount < 0)
6507 break;
6508 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6509 i += str1->length;
6510 } else
6511 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514
6515 Py_ssize_t n, i, j, e;
6516 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 Py_UNICODE *p;
6518
6519 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 if (n > maxcount)
6522 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 if (n == 0)
6524 goto nothing;
6525 /* new_size = self->length + n * (str2->length - str1->length)); */
6526 delta = (str2->length - str1->length);
6527 if (delta == 0) {
6528 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 product = n * (str2->length - str1->length);
6531 if ((product / (str2->length - str1->length)) != n) {
6532 PyErr_SetString(PyExc_OverflowError,
6533 "replace string is too long");
6534 return NULL;
6535 }
6536 new_size = self->length + product;
6537 if (new_size < 0) {
6538 PyErr_SetString(PyExc_OverflowError,
6539 "replace string is too long");
6540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 }
6542 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 u = _PyUnicode_New(new_size);
6544 if (!u)
6545 return NULL;
6546 i = 0;
6547 p = u->str;
6548 e = self->length - str1->length;
6549 if (str1->length > 0) {
6550 while (n-- > 0) {
6551 /* look for next match */
6552 j = i;
6553 while (j <= e) {
6554 if (Py_UNICODE_MATCH(self, j, str1))
6555 break;
6556 j++;
6557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006559 if (j > e)
6560 break;
6561 /* copy unchanged part [i:j] */
6562 Py_UNICODE_COPY(p, self->str+i, j-i);
6563 p += j - i;
6564 }
6565 /* copy substitution string */
6566 if (str2->length > 0) {
6567 Py_UNICODE_COPY(p, str2->str, str2->length);
6568 p += str2->length;
6569 }
6570 i = j + str1->length;
6571 }
6572 if (i < self->length)
6573 /* copy tail [i:] */
6574 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6575 } else {
6576 /* interleave */
6577 while (n > 0) {
6578 Py_UNICODE_COPY(p, str2->str, str2->length);
6579 p += str2->length;
6580 if (--n <= 0)
6581 break;
6582 *p++ = self->str[i++];
6583 }
6584 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590 /* nothing to replace; return original string (when possible) */
6591 if (PyUnicode_CheckExact(self)) {
6592 Py_INCREF(self);
6593 return (PyObject *) self;
6594 }
6595 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596}
6597
6598/* --- Unicode Object Methods --------------------------------------------- */
6599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602\n\
6603Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006604characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
6606static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006607unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return fixup(self, fixtitle);
6610}
6611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006612PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614\n\
6615Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
6618static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006619unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 return fixup(self, fixcapitalize);
6622}
6623
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word and join the words
   again with the default separator of PyUnicode_Join().
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                               fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6661
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006662/* Argument converter. Coerces to a single unicode character */
6663
6664static int
6665convert_uc(PyObject *obj, void *addr)
6666{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006667 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6668 PyObject *uniobj;
6669 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006670
Benjamin Peterson14339b62009-01-31 16:36:08 +00006671 uniobj = PyUnicode_FromObject(obj);
6672 if (uniobj == NULL) {
6673 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006675 return 0;
6676 }
6677 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6678 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006680 Py_DECREF(uniobj);
6681 return 0;
6682 }
6683 unistr = PyUnicode_AS_UNICODE(uniobj);
6684 *fillcharloc = unistr[0];
6685 Py_DECREF(uniobj);
6686 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006687}
6688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006692Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006693done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
6695static PyObject *
6696unicode_center(PyUnicodeObject *self, PyObject *args)
6697{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006698 Py_ssize_t marg, left;
6699 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006700 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
Thomas Woutersde017742006-02-16 19:34:37 +00006702 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 return NULL;
6704
Tim Peters7a29bd52001-09-12 03:03:31 +00006705 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 Py_INCREF(self);
6707 return (PyObject*) self;
6708 }
6709
6710 marg = width - self->length;
6711 left = marg / 2 + (marg & width & 1);
6712
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006713 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Marc-André Lemburge5034372000-08-08 08:04:29 +00006716#if 0
6717
6718/* This code should go into some future Unicode collation support
6719 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006720 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006721
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006722/* speedy UTF-16 code point order comparison */
6723/* gleaned from: */
6724/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6725
/* Adjustment table for the disabled UTF-16 comparison: the entry at index
   (code unit >> 11) is added to the code unit, so each entry covers a
   block of 0x800 code units.  Only the 0xD800-0xDFFF block (+0x2000) and
   the four blocks from 0xE000 to 0xFFFF (-0x800 each) are non-zero. */
static short utf16Fixup[32] =
{
    /* 0x0000 - 0x3FFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x4000 - 0x7FFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x8000 - 0xBFFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC000 - 0xFFFF */
    0, 0, 0,
    0x2000,
    -0x800, -0x800, -0x800, -0x800
};
6733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734static int
6735unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6736{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006737 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 Py_UNICODE *s1 = str1->str;
6740 Py_UNICODE *s2 = str2->str;
6741
6742 len1 = str1->length;
6743 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006746 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006747
6748 c1 = *s1++;
6749 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 if (c1 > (1<<11) * 26)
6752 c1 += utf16Fixup[c1>>11];
6753 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006754 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006755 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006756
6757 if (c1 != c2)
6758 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006759
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006760 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 }
6762
6763 return (len1 < len2) ? -1 : (len1 != len2);
6764}
6765
Marc-André Lemburge5034372000-08-08 08:04:29 +00006766#else
6767
6768static int
6769unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6770{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006771 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006772
6773 Py_UNICODE *s1 = str1->str;
6774 Py_UNICODE *s2 = str2->str;
6775
6776 len1 = str1->length;
6777 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006778
Marc-André Lemburge5034372000-08-08 08:04:29 +00006779 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006780 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006781
Fredrik Lundh45714e92001-06-26 16:39:36 +00006782 c1 = *s1++;
6783 c2 = *s2++;
6784
6785 if (c1 != c2)
6786 return (c1 < c2) ? -1 : 1;
6787
Marc-André Lemburge5034372000-08-08 08:04:29 +00006788 len1--; len2--;
6789 }
6790
6791 return (len1 < len2) ? -1 : (len1 != len2);
6792}
6793
6794#endif
6795
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006799 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6800 return unicode_compare((PyUnicodeObject *)left,
6801 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006802 PyErr_Format(PyExc_TypeError,
6803 "Can't compare %.100s and %.100s",
6804 left->ob_type->tp_name,
6805 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 return -1;
6807}
6808
Martin v. Löwis5b222132007-06-10 09:51:05 +00006809int
6810PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6811{
6812 int i;
6813 Py_UNICODE *id;
6814 assert(PyUnicode_Check(uni));
6815 id = PyUnicode_AS_UNICODE(uni);
6816 /* Compare Unicode string and source character set string */
6817 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 if (id[i] != str[i])
6819 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006820 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006822 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006824 return 0;
6825}
6826
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006827
Benjamin Peterson29060642009-01-31 22:14:21 +00006828#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006829 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006830
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006831PyObject *PyUnicode_RichCompare(PyObject *left,
6832 PyObject *right,
6833 int op)
6834{
6835 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006836
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006837 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6838 PyObject *v;
6839 if (((PyUnicodeObject *) left)->length !=
6840 ((PyUnicodeObject *) right)->length) {
6841 if (op == Py_EQ) {
6842 Py_INCREF(Py_False);
6843 return Py_False;
6844 }
6845 if (op == Py_NE) {
6846 Py_INCREF(Py_True);
6847 return Py_True;
6848 }
6849 }
6850 if (left == right)
6851 result = 0;
6852 else
6853 result = unicode_compare((PyUnicodeObject *)left,
6854 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006855
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006856 /* Convert the return value to a Boolean */
6857 switch (op) {
6858 case Py_EQ:
6859 v = TEST_COND(result == 0);
6860 break;
6861 case Py_NE:
6862 v = TEST_COND(result != 0);
6863 break;
6864 case Py_LE:
6865 v = TEST_COND(result <= 0);
6866 break;
6867 case Py_GE:
6868 v = TEST_COND(result >= 0);
6869 break;
6870 case Py_LT:
6871 v = TEST_COND(result == -1);
6872 break;
6873 case Py_GT:
6874 v = TEST_COND(result == 1);
6875 break;
6876 default:
6877 PyErr_BadArgument();
6878 return NULL;
6879 }
6880 Py_INCREF(v);
6881 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006884 Py_INCREF(Py_NotImplemented);
6885 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006886}
6887
Guido van Rossum403d68b2000-03-13 15:55:09 +00006888int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006890{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006891 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006893
6894 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006895 sub = PyUnicode_FromObject(element);
6896 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 PyErr_Format(PyExc_TypeError,
6898 "'in <string>' requires string as left operand, not %s",
6899 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006900 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006901 }
6902
Thomas Wouters477c8d52006-05-27 19:21:47 +00006903 str = PyUnicode_FromObject(container);
6904 if (!str) {
6905 Py_DECREF(sub);
6906 return -1;
6907 }
6908
6909 result = stringlib_contains_obj(str, sub);
6910
6911 Py_DECREF(str);
6912 Py_DECREF(sub);
6913
Guido van Rossum403d68b2000-03-13 15:55:09 +00006914 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006915}
6916
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917/* Concat to string or Unicode object giving a new Unicode object. */
6918
6919PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921{
6922 PyUnicodeObject *u = NULL, *v = NULL, *w;
6923
6924 /* Coerce the two arguments */
6925 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6926 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6929 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932 /* Shortcuts */
6933 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 Py_DECREF(v);
6935 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
6937 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 Py_DECREF(u);
6939 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
6941
6942 /* Concat the two Unicode strings */
6943 w = _PyUnicode_New(u->length + v->length);
6944 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 Py_UNICODE_COPY(w->str, u->str, u->length);
6947 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6948
6949 Py_DECREF(u);
6950 Py_DECREF(v);
6951 return (PyObject *)w;
6952
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 Py_XDECREF(u);
6955 Py_XDECREF(v);
6956 return NULL;
6957}
6958
Walter Dörwald1ab83302007-05-18 17:15:44 +00006959void
6960PyUnicode_Append(PyObject **pleft, PyObject *right)
6961{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 PyObject *new;
6963 if (*pleft == NULL)
6964 return;
6965 if (right == NULL || !PyUnicode_Check(*pleft)) {
6966 Py_DECREF(*pleft);
6967 *pleft = NULL;
6968 return;
6969 }
6970 new = PyUnicode_Concat(*pleft, right);
6971 Py_DECREF(*pleft);
6972 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006973}
6974
6975void
6976PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6977{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006978 PyUnicode_Append(pleft, right);
6979 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006980}
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006986string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject *
6990unicode_count(PyUnicodeObject *self, PyObject *args)
6991{
6992 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006993 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006994 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 PyObject *result;
6996
Guido van Rossumb8872e62000-05-09 14:14:27 +00006997 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
7000
7001 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007005
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007
Christian Heimes217cfd12007-12-02 14:31:20 +00007008 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 stringlib_count(self->str + start, end - start,
7010 substring->str, substring->length)
7011 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
7013 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 return result;
7016}
7017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007018PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007021Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007022to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007023handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7025'xmlcharrefreplace' as well as any other name registered with\n\
7026codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027
7028static PyObject *
7029unicode_encode(PyUnicodeObject *self, PyObject *args)
7030{
7031 char *encoding = NULL;
7032 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007033 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7036 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007037 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007038 if (v == NULL)
7039 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007040 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007041 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007042 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007043 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007044 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007045 Py_DECREF(v);
7046 return NULL;
7047 }
7048 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007049
Benjamin Peterson29060642009-01-31 22:14:21 +00007050 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007051 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
7057Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
7061unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7062{
7063 Py_UNICODE *e;
7064 Py_UNICODE *p;
7065 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 Py_UNICODE *qe;
7067 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 PyUnicodeObject *u;
7069 int tabsize = 8;
7070
7071 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
Thomas Wouters7e474022000-07-16 12:04:32 +00007074 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007075 i = 0; /* chars up to and including most recent \n or \r */
7076 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7077 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 for (p = self->str; p < e; p++)
7079 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 if (tabsize > 0) {
7081 incr = tabsize - (j % tabsize); /* cannot overflow */
7082 if (j > PY_SSIZE_T_MAX - incr)
7083 goto overflow1;
7084 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 if (j > PY_SSIZE_T_MAX - 1)
7089 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 j++;
7091 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 if (i > PY_SSIZE_T_MAX - j)
7093 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007095 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 }
7097 }
7098
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007099 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007101
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 /* Second pass: create output string and fill it */
7103 u = _PyUnicode_New(i + j);
7104 if (!u)
7105 return NULL;
7106
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007107 j = 0; /* same as in first pass */
7108 q = u->str; /* next output char */
7109 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111 for (p = self->str; p < e; p++)
7112 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 if (tabsize > 0) {
7114 i = tabsize - (j % tabsize);
7115 j += i;
7116 while (i--) {
7117 if (q >= qe)
7118 goto overflow2;
7119 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 else {
7124 if (q >= qe)
7125 goto overflow2;
7126 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007127 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 if (*p == '\n' || *p == '\r')
7129 j = 0;
7130 }
7131
7132 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007133
7134 overflow2:
7135 Py_DECREF(u);
7136 overflow1:
7137 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
7144Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007145such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146arguments start and end are interpreted as in slice notation.\n\
7147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
7151unicode_find(PyUnicodeObject *self, PyObject *args)
7152{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007154 Py_ssize_t start;
7155 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007156 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Christian Heimes9cd17752007-11-18 19:35:23 +00007158 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 result = stringlib_find_slice(
7162 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7163 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7164 start, end
7165 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
7167 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168
Christian Heimes217cfd12007-12-02 14:31:20 +00007169 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170}
7171
7172static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007173unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174{
7175 if (index < 0 || index >= self->length) {
7176 PyErr_SetString(PyExc_IndexError, "string index out of range");
7177 return NULL;
7178 }
7179
7180 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7181}
7182
Guido van Rossumc2504932007-09-18 19:42:40 +00007183/* Believe it or not, this produces the same value for ASCII strings
7184 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007186unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187{
Guido van Rossumc2504932007-09-18 19:42:40 +00007188 Py_ssize_t len;
7189 Py_UNICODE *p;
7190 long x;
7191
7192 if (self->hash != -1)
7193 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007194 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007195 p = self->str;
7196 x = *p << 7;
7197 while (--len >= 0)
7198 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007199 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007200 if (x == -1)
7201 x = -2;
7202 self->hash = x;
7203 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204}
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211static PyObject *
7212unicode_index(PyUnicodeObject *self, PyObject *args)
7213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007215 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007216 Py_ssize_t start;
7217 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
Christian Heimes9cd17752007-11-18 19:35:23 +00007219 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221
Thomas Wouters477c8d52006-05-27 19:21:47 +00007222 result = stringlib_find_slice(
7223 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7224 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7225 start, end
7226 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 if (result < 0) {
7231 PyErr_SetString(PyExc_ValueError, "substring not found");
7232 return NULL;
7233 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007234
Christian Heimes217cfd12007-12-02 14:31:20 +00007235 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236}
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007241Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007242at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
7244static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007245unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
7247 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7248 register const Py_UNICODE *e;
7249 int cased;
7250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 /* Shortcut for single character strings */
7252 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007255 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007256 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 e = p + PyUnicode_GET_SIZE(self);
7260 cased = 0;
7261 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007263
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7265 return PyBool_FromLong(0);
7266 else if (!cased && Py_UNICODE_ISLOWER(ch))
7267 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007269 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270}
7271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007275Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
7278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007279unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280{
7281 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7282 register const Py_UNICODE *e;
7283 int cased;
7284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 /* Shortcut for single character strings */
7286 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007289 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007290 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 e = p + PyUnicode_GET_SIZE(self);
7294 cased = 0;
7295 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007297
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7299 return PyBool_FromLong(0);
7300 else if (!cased && Py_UNICODE_ISUPPER(ch))
7301 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007303 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304}
7305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007309Return True if S is a titlecased string and there is at least one\n\
7310character in S, i.e. upper- and titlecase characters may only\n\
7311follow uncased characters and lowercase characters only cased ones.\n\
7312Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007315unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316{
7317 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7318 register const Py_UNICODE *e;
7319 int cased, previous_is_cased;
7320
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 /* Shortcut for single character strings */
7322 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7324 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007326 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007327 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007329
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 e = p + PyUnicode_GET_SIZE(self);
7331 cased = 0;
7332 previous_is_cased = 0;
7333 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007335
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7337 if (previous_is_cased)
7338 return PyBool_FromLong(0);
7339 previous_is_cased = 1;
7340 cased = 1;
7341 }
7342 else if (Py_UNICODE_ISLOWER(ch)) {
7343 if (!previous_is_cased)
7344 return PyBool_FromLong(0);
7345 previous_is_cased = 1;
7346 cased = 1;
7347 }
7348 else
7349 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007351 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352}
7353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007357Return True if all characters in S are whitespace\n\
7358and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007361unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362{
7363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7364 register const Py_UNICODE *e;
7365
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 /* Shortcut for single character strings */
7367 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 Py_UNICODE_ISSPACE(*p))
7369 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007372 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007374
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 e = p + PyUnicode_GET_SIZE(self);
7376 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 if (!Py_UNICODE_ISSPACE(*p))
7378 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381}
7382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007385\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007386Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007388
7389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007390unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391{
7392 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7393 register const Py_UNICODE *e;
7394
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395 /* Shortcut for single character strings */
7396 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 Py_UNICODE_ISALPHA(*p))
7398 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007399
7400 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007401 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007403
7404 e = p + PyUnicode_GET_SIZE(self);
7405 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (!Py_UNICODE_ISALPHA(*p))
7407 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007409 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007410}
7411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007414\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007415Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007417
7418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007419unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007420{
7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422 register const Py_UNICODE *e;
7423
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007424 /* Shortcut for single character strings */
7425 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 Py_UNICODE_ISALNUM(*p))
7427 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007428
7429 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007430 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007432
7433 e = p + PyUnicode_GET_SIZE(self);
7434 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (!Py_UNICODE_ISALNUM(*p))
7436 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007444Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007448unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7451 register const Py_UNICODE *e;
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Shortcut for single character strings */
7454 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_UNICODE_ISDECIMAL(*p))
7456 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007459 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 e = p + PyUnicode_GET_SIZE(self);
7463 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (!Py_UNICODE_ISDECIMAL(*p))
7465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007473Return True if all characters in S are digits\n\
7474and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007477unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
7479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7480 register const Py_UNICODE *e;
7481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 /* Shortcut for single character strings */
7483 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 Py_UNICODE_ISDIGIT(*p))
7485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007488 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 e = p + PyUnicode_GET_SIZE(self);
7492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 if (!Py_UNICODE_ISDIGIT(*p))
7494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497}
7498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007502Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007506unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
7508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7509 register const Py_UNICODE *e;
7510
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 /* Shortcut for single character strings */
7512 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 Py_UNICODE_ISNUMERIC(*p))
7514 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007517 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 e = p + PyUnicode_GET_SIZE(self);
7521 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (!Py_UNICODE_ISNUMERIC(*p))
7523 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007525 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Martin v. Löwis47383402007-08-15 07:32:56 +00007528int
7529PyUnicode_IsIdentifier(PyObject *self)
7530{
7531 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7532 register const Py_UNICODE *e;
7533
7534 /* Special case for empty strings */
7535 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007537
7538 /* PEP 3131 says that the first character must be in
7539 XID_Start and subsequent characters in XID_Continue,
7540 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007541 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007542 letters, digits, underscore). However, given the current
7543 definition of XID_Start and XID_Continue, it is sufficient
7544 to check just for these, except that _ must be allowed
7545 as starting an identifier. */
7546 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7547 return 0;
7548
7549 e = p + PyUnicode_GET_SIZE(self);
7550 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 if (!_PyUnicode_IsXidContinue(*p))
7552 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007553 }
7554 return 1;
7555}
7556
7557PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007559\n\
7560Return True if S is a valid identifier according\n\
7561to the language definition.");
7562
7563static PyObject*
7564unicode_isidentifier(PyObject *self)
7565{
7566 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7567}
7568
Georg Brandl559e5d72008-06-11 18:37:52 +00007569PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007571\n\
7572Return True if all characters in S are considered\n\
7573printable in repr() or S is empty, False otherwise.");
7574
7575static PyObject*
7576unicode_isprintable(PyObject *self)
7577{
7578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7579 register const Py_UNICODE *e;
7580
7581 /* Shortcut for single character strings */
7582 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7583 Py_RETURN_TRUE;
7584 }
7585
7586 e = p + PyUnicode_GET_SIZE(self);
7587 for (; p < e; p++) {
7588 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7589 Py_RETURN_FALSE;
7590 }
7591 }
7592 Py_RETURN_TRUE;
7593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597\n\
7598Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007602unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007604 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
Martin v. Löwis18e16552006-02-15 17:27:45 +00007607static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608unicode_length(PyUnicodeObject *self)
7609{
7610 return self->length;
7611}
7612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007613PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007616Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007617done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
7619static PyObject *
7620unicode_ljust(PyUnicodeObject *self, PyObject *args)
7621{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007622 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007623 Py_UNICODE fillchar = ' ';
7624
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007625 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 return NULL;
7627
Tim Peters7a29bd52001-09-12 03:03:31 +00007628 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 Py_INCREF(self);
7630 return (PyObject*) self;
7631 }
7632
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007633 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634}
7635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007639Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007642unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return fixup(self, fixlower);
7645}
7646
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7655
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007656/* externally visible for str.strip(unicode) */
7657PyObject *
7658_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7659{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7661 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7662 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7663 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7664 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007667
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 i = 0;
7669 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7671 i++;
7672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007674
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 j = len;
7676 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 do {
7678 j--;
7679 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7680 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007681 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007682
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 Py_INCREF(self);
7685 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 }
7687 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689}
7690
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
7692static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007693do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7696 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697
Benjamin Peterson14339b62009-01-31 16:36:08 +00007698 i = 0;
7699 if (striptype != RIGHTSTRIP) {
7700 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7701 i++;
7702 }
7703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007704
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 j = len;
7706 if (striptype != LEFTSTRIP) {
7707 do {
7708 j--;
7709 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7710 j++;
7711 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007712
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7714 Py_INCREF(self);
7715 return (PyObject*)self;
7716 }
7717 else
7718 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719}
7720
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
7722static PyObject *
7723do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007726
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7728 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 if (sep != NULL && sep != Py_None) {
7731 if (PyUnicode_Check(sep))
7732 return _PyUnicode_XStrip(self, striptype, sep);
7733 else {
7734 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 "%s arg must be None or str",
7736 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007737 return NULL;
7738 }
7739 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007740
Benjamin Peterson14339b62009-01-31 16:36:08 +00007741 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007742}
7743
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007747\n\
7748Return a copy of the string S with leading and trailing\n\
7749whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007750If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751
7752static PyObject *
7753unicode_strip(PyUnicodeObject *self, PyObject *args)
7754{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 if (PyTuple_GET_SIZE(args) == 0)
7756 return do_strip(self, BOTHSTRIP); /* Common case */
7757 else
7758 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007759}
7760
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007764\n\
7765Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007766If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007767
7768static PyObject *
7769unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 if (PyTuple_GET_SIZE(args) == 0)
7772 return do_strip(self, LEFTSTRIP); /* Common case */
7773 else
7774 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775}
7776
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007780\n\
7781Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007782If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783
7784static PyObject *
7785unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 if (PyTuple_GET_SIZE(args) == 0)
7788 return do_strip(self, RIGHTSTRIP); /* Common case */
7789 else
7790 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791}
7792
7793
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007795unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796{
7797 PyUnicodeObject *u;
7798 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007800 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
Georg Brandl222de0f2009-04-12 12:01:50 +00007802 if (len < 1) {
7803 Py_INCREF(unicode_empty);
7804 return (PyObject *)unicode_empty;
7805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806
Tim Peters7a29bd52001-09-12 03:03:31 +00007807 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 /* no repeat, return original string */
7809 Py_INCREF(str);
7810 return (PyObject*) str;
7811 }
Tim Peters8f422462000-09-09 06:13:41 +00007812
7813 /* ensure # of chars needed doesn't overflow int and # of bytes
7814 * needed doesn't overflow size_t
7815 */
7816 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007817 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007818 PyErr_SetString(PyExc_OverflowError,
7819 "repeated string is too long");
7820 return NULL;
7821 }
7822 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7823 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7824 PyErr_SetString(PyExc_OverflowError,
7825 "repeated string is too long");
7826 return NULL;
7827 }
7828 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 if (!u)
7830 return NULL;
7831
7832 p = u->str;
7833
Georg Brandl222de0f2009-04-12 12:01:50 +00007834 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007835 Py_UNICODE_FILL(p, str->str[0], len);
7836 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007837 Py_ssize_t done = str->length; /* number of characters copied this far */
7838 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007840 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007841 Py_UNICODE_COPY(p+done, p, n);
7842 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 }
7845
7846 return (PyObject*) u;
7847}
7848
7849PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 PyObject *subobj,
7851 PyObject *replobj,
7852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853{
7854 PyObject *self;
7855 PyObject *str1;
7856 PyObject *str2;
7857 PyObject *result;
7858
7859 self = PyUnicode_FromObject(obj);
7860 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 str1 = PyUnicode_FromObject(subobj);
7863 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 Py_DECREF(self);
7865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 }
7867 str2 = PyUnicode_FromObject(replobj);
7868 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 Py_DECREF(self);
7870 Py_DECREF(str1);
7871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 }
Tim Petersced69f82003-09-16 20:30:58 +00007873 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 (PyUnicodeObject *)str1,
7875 (PyUnicodeObject *)str2,
7876 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 Py_DECREF(self);
7878 Py_DECREF(str1);
7879 Py_DECREF(str2);
7880 return result;
7881}
7882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007883PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885\n\
7886Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007887old replaced by new. If the optional argument count is\n\
7888given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
7890static PyObject*
7891unicode_replace(PyUnicodeObject *self, PyObject *args)
7892{
7893 PyUnicodeObject *str1;
7894 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007895 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 PyObject *result;
7897
Martin v. Löwis18e16552006-02-15 17:27:45 +00007898 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 return NULL;
7900 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7901 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007904 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(str1);
7906 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908
7909 result = replace(self, str1, str2, maxcount);
7910
7911 Py_DECREF(str1);
7912 Py_DECREF(str2);
7913 return result;
7914}
7915
7916static
7917PyObject *unicode_repr(PyObject *unicode)
7918{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007919 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007920 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007921 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7922 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7923
7924 /* XXX(nnorwitz): rather than over-allocating, it would be
7925 better to choose a different scheme. Perhaps scan the
7926 first N-chars of the string and allocate based on that size.
7927 */
7928 /* Initial allocation is based on the longest-possible unichr
7929 escape.
7930
7931 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7932 unichr, so in this case it's the longest unichr escape. In
7933 narrow (UTF-16) builds this is five chars per source unichr
7934 since there are two unichrs in the surrogate pair, so in narrow
7935 (UTF-16) builds it's not the longest unichr escape.
7936
7937 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7938 so in the narrow (UTF-16) build case it's the longest unichr
7939 escape.
7940 */
7941
Walter Dörwald1ab83302007-05-18 17:15:44 +00007942 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007944#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007946#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007948#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007950 if (repr == NULL)
7951 return NULL;
7952
Walter Dörwald1ab83302007-05-18 17:15:44 +00007953 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007954
7955 /* Add quote */
7956 *p++ = (findchar(s, size, '\'') &&
7957 !findchar(s, size, '"')) ? '"' : '\'';
7958 while (size-- > 0) {
7959 Py_UNICODE ch = *s++;
7960
7961 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007962 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007963 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007964 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007965 continue;
7966 }
7967
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007969 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007970 *p++ = '\\';
7971 *p++ = 't';
7972 }
7973 else if (ch == '\n') {
7974 *p++ = '\\';
7975 *p++ = 'n';
7976 }
7977 else if (ch == '\r') {
7978 *p++ = '\\';
7979 *p++ = 'r';
7980 }
7981
7982 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007983 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007984 *p++ = '\\';
7985 *p++ = 'x';
7986 *p++ = hexdigits[(ch >> 4) & 0x000F];
7987 *p++ = hexdigits[ch & 0x000F];
7988 }
7989
Georg Brandl559e5d72008-06-11 18:37:52 +00007990 /* Copy ASCII characters as-is */
7991 else if (ch < 0x7F) {
7992 *p++ = ch;
7993 }
7994
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007996 else {
7997 Py_UCS4 ucs = ch;
7998
7999#ifndef Py_UNICODE_WIDE
8000 Py_UNICODE ch2 = 0;
8001 /* Get code point from surrogate pair */
8002 if (size > 0) {
8003 ch2 = *s;
8004 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008009 size--;
8010 }
8011 }
8012#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008014 (categories Z* and C* except ASCII space)
8015 */
8016 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8017 /* Map 8-bit characters to '\xhh' */
8018 if (ucs <= 0xff) {
8019 *p++ = '\\';
8020 *p++ = 'x';
8021 *p++ = hexdigits[(ch >> 4) & 0x000F];
8022 *p++ = hexdigits[ch & 0x000F];
8023 }
8024 /* Map 21-bit characters to '\U00xxxxxx' */
8025 else if (ucs >= 0x10000) {
8026 *p++ = '\\';
8027 *p++ = 'U';
8028 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8029 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8030 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8031 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8032 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8033 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8034 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8035 *p++ = hexdigits[ucs & 0x0000000F];
8036 }
8037 /* Map 16-bit characters to '\uxxxx' */
8038 else {
8039 *p++ = '\\';
8040 *p++ = 'u';
8041 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8042 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8043 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8044 *p++ = hexdigits[ucs & 0x000F];
8045 }
8046 }
8047 /* Copy characters as-is */
8048 else {
8049 *p++ = ch;
8050#ifndef Py_UNICODE_WIDE
8051 if (ucs >= 0x10000)
8052 *p++ = ch2;
8053#endif
8054 }
8055 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008056 }
8057 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008058 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008059
8060 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008061 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008062 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063}
8064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008065PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067\n\
8068Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008069such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070arguments start and end are interpreted as in slice notation.\n\
8071\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008072Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073
8074static PyObject *
8075unicode_rfind(PyUnicodeObject *self, PyObject *args)
8076{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008077 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008078 Py_ssize_t start;
8079 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008080 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
Christian Heimes9cd17752007-11-18 19:35:23 +00008082 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085 result = stringlib_rfind_slice(
8086 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8087 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8088 start, end
8089 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090
8091 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008092
Christian Heimes217cfd12007-12-02 14:31:20 +00008093 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094}
8095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008096PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008099Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100
8101static PyObject *
8102unicode_rindex(PyUnicodeObject *self, PyObject *args)
8103{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008104 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008105 Py_ssize_t start;
8106 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008107 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
Christian Heimes9cd17752007-11-18 19:35:23 +00008109 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111
Thomas Wouters477c8d52006-05-27 19:21:47 +00008112 result = stringlib_rfind_slice(
8113 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8114 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8115 start, end
8116 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
8118 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008119
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 if (result < 0) {
8121 PyErr_SetString(PyExc_ValueError, "substring not found");
8122 return NULL;
8123 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008124 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125}
8126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008127PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008130Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008131done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132
8133static PyObject *
8134unicode_rjust(PyUnicodeObject *self, PyObject *args)
8135{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008136 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008137 Py_UNICODE fillchar = ' ';
8138
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008139 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 return NULL;
8141
Tim Peters7a29bd52001-09-12 03:03:31 +00008142 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 Py_INCREF(self);
8144 return (PyObject*) self;
8145 }
8146
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008147 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148}
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 PyObject *sep,
8152 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153{
8154 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008155
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 s = PyUnicode_FromObject(s);
8157 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 if (sep != NULL) {
8160 sep = PyUnicode_FromObject(sep);
8161 if (sep == NULL) {
8162 Py_DECREF(s);
8163 return NULL;
8164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 }
8166
8167 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8168
8169 Py_DECREF(s);
8170 Py_XDECREF(sep);
8171 return result;
8172}
8173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176\n\
8177Return a list of the words in S, using sep as the\n\
8178delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008179splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008180whitespace string is a separator and empty strings are\n\
8181removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
8183static PyObject*
8184unicode_split(PyUnicodeObject *self, PyObject *args)
8185{
8186 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008187 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Martin v. Löwis18e16552006-02-15 17:27:45 +00008189 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 return NULL;
8191
8192 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198}
8199
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200PyObject *
8201PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8202{
8203 PyObject* str_obj;
8204 PyObject* sep_obj;
8205 PyObject* out;
8206
8207 str_obj = PyUnicode_FromObject(str_in);
8208 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210 sep_obj = PyUnicode_FromObject(sep_in);
8211 if (!sep_obj) {
8212 Py_DECREF(str_obj);
8213 return NULL;
8214 }
8215
8216 out = stringlib_partition(
8217 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8218 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8219 );
8220
8221 Py_DECREF(sep_obj);
8222 Py_DECREF(str_obj);
8223
8224 return out;
8225}
8226
8227
8228PyObject *
8229PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8230{
8231 PyObject* str_obj;
8232 PyObject* sep_obj;
8233 PyObject* out;
8234
8235 str_obj = PyUnicode_FromObject(str_in);
8236 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238 sep_obj = PyUnicode_FromObject(sep_in);
8239 if (!sep_obj) {
8240 Py_DECREF(str_obj);
8241 return NULL;
8242 }
8243
8244 out = stringlib_rpartition(
8245 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8246 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8247 );
8248
8249 Py_DECREF(sep_obj);
8250 Py_DECREF(str_obj);
8251
8252 return out;
8253}
8254
8255PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008257\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008258Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008260found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008261
8262static PyObject*
8263unicode_partition(PyUnicodeObject *self, PyObject *separator)
8264{
8265 return PyUnicode_Partition((PyObject *)self, separator);
8266}
8267
8268PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008270\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008271Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008272the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008273separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274
8275static PyObject*
8276unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8277{
8278 return PyUnicode_RPartition((PyObject *)self, separator);
8279}
8280
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008281PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 PyObject *sep,
8283 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008284{
8285 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008286
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008287 s = PyUnicode_FromObject(s);
8288 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 if (sep != NULL) {
8291 sep = PyUnicode_FromObject(sep);
8292 if (sep == NULL) {
8293 Py_DECREF(s);
8294 return NULL;
8295 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008296 }
8297
8298 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8299
8300 Py_DECREF(s);
8301 Py_XDECREF(sep);
8302 return result;
8303}
8304
8305PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008307\n\
8308Return a list of the words in S, using sep as the\n\
8309delimiter string, starting at the end of the string and\n\
8310working to the front. If maxsplit is given, at most maxsplit\n\
8311splits are done. If sep is not specified, any whitespace string\n\
8312is a separator.");
8313
8314static PyObject*
8315unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8316{
8317 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008318 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008319
Martin v. Löwis18e16552006-02-15 17:27:45 +00008320 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008321 return NULL;
8322
8323 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008325 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008327 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008329}
8330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008331PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333\n\
8334Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008335Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008336is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338static PyObject*
8339unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8340{
Guido van Rossum86662912000-04-11 15:38:46 +00008341 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Guido van Rossum86662912000-04-11 15:38:46 +00008343 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 return NULL;
8345
Guido van Rossum86662912000-04-11 15:38:46 +00008346 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347}
8348
8349static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008350PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
Walter Dörwald346737f2007-05-31 10:44:43 +00008352 if (PyUnicode_CheckExact(self)) {
8353 Py_INCREF(self);
8354 return self;
8355 } else
8356 /* Subtype -- return genuine unicode string with the same value. */
8357 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8358 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359}
8360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008361PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363\n\
8364Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008365and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
8367static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008368unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 return fixup(self, fixswapcase);
8371}
8372
Georg Brandlceee0772007-11-27 23:48:05 +00008373PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008375\n\
8376Return a translation table usable for str.translate().\n\
8377If there is only one argument, it must be a dictionary mapping Unicode\n\
8378ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008379Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008380If there are two arguments, they must be strings of equal length, and\n\
8381in the resulting dictionary, each character in x will be mapped to the\n\
8382character at the same position in y. If there is a third argument, it\n\
8383must be a string, whose characters will be mapped to None in the result.");
8384
8385static PyObject*
8386unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8387{
8388 PyObject *x, *y = NULL, *z = NULL;
8389 PyObject *new = NULL, *key, *value;
8390 Py_ssize_t i = 0;
8391 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392
Georg Brandlceee0772007-11-27 23:48:05 +00008393 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8394 return NULL;
8395 new = PyDict_New();
8396 if (!new)
8397 return NULL;
8398 if (y != NULL) {
8399 /* x must be a string too, of equal length */
8400 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8401 if (!PyUnicode_Check(x)) {
8402 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8403 "be a string if there is a second argument");
8404 goto err;
8405 }
8406 if (PyUnicode_GET_SIZE(x) != ylen) {
8407 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8408 "arguments must have equal length");
8409 goto err;
8410 }
8411 /* create entries for translating chars in x to those in y */
8412 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008413 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8414 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008415 if (!key || !value)
8416 goto err;
8417 res = PyDict_SetItem(new, key, value);
8418 Py_DECREF(key);
8419 Py_DECREF(value);
8420 if (res < 0)
8421 goto err;
8422 }
8423 /* create entries for deleting chars in z */
8424 if (z != NULL) {
8425 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008426 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008427 if (!key)
8428 goto err;
8429 res = PyDict_SetItem(new, key, Py_None);
8430 Py_DECREF(key);
8431 if (res < 0)
8432 goto err;
8433 }
8434 }
8435 } else {
8436 /* x must be a dict */
8437 if (!PyDict_Check(x)) {
8438 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8439 "to maketrans it must be a dict");
8440 goto err;
8441 }
8442 /* copy entries into the new dict, converting string keys to int keys */
8443 while (PyDict_Next(x, &i, &key, &value)) {
8444 if (PyUnicode_Check(key)) {
8445 /* convert string keys to integer keys */
8446 PyObject *newkey;
8447 if (PyUnicode_GET_SIZE(key) != 1) {
8448 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8449 "table must be of length 1");
8450 goto err;
8451 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008452 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008453 if (!newkey)
8454 goto err;
8455 res = PyDict_SetItem(new, newkey, value);
8456 Py_DECREF(newkey);
8457 if (res < 0)
8458 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008459 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008460 /* just keep integer keys */
8461 if (PyDict_SetItem(new, key, value) < 0)
8462 goto err;
8463 } else {
8464 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8465 "be strings or integers");
8466 goto err;
8467 }
8468 }
8469 }
8470 return new;
8471 err:
8472 Py_DECREF(new);
8473 return NULL;
8474}
8475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008476PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478\n\
8479Return a copy of the string S, where all characters have been mapped\n\
8480through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008481Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008482Unmapped characters are left untouched. Characters mapped to None\n\
8483are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
8485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008486unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487{
Georg Brandlceee0772007-11-27 23:48:05 +00008488 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489}
8490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008491PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008494Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
8496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008497unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 return fixup(self, fixupper);
8500}
8501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008502PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008505Pad a numeric string S with zeros on the left, to fill a field\n\
8506of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
8508static PyObject *
8509unicode_zfill(PyUnicodeObject *self, PyObject *args)
8510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008511 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 PyUnicodeObject *u;
8513
Martin v. Löwis18e16552006-02-15 17:27:45 +00008514 Py_ssize_t width;
8515 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 return NULL;
8517
8518 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008519 if (PyUnicode_CheckExact(self)) {
8520 Py_INCREF(self);
8521 return (PyObject*) self;
8522 }
8523 else
8524 return PyUnicode_FromUnicode(
8525 PyUnicode_AS_UNICODE(self),
8526 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 }
8529
8530 fill = width - self->length;
8531
8532 u = pad(self, fill, 0, '0');
8533
Walter Dörwald068325e2002-04-15 13:36:47 +00008534 if (u == NULL)
8535 return NULL;
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 if (u->str[fill] == '+' || u->str[fill] == '-') {
8538 /* move sign to beginning of string */
8539 u->str[0] = u->str[fill];
8540 u->str[fill] = '0';
8541 }
8542
8543 return (PyObject*) u;
8544}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
#if 0
/* Compiled out.  Would report the file-level `numfree` counter as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008554PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008557Return True if S starts with the specified prefix, False otherwise.\n\
8558With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008559With optional end, stop comparing S at that position.\n\
8560prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
8562static PyObject *
8563unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008566 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008568 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008569 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008570 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008572 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8574 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 if (PyTuple_Check(subobj)) {
8576 Py_ssize_t i;
8577 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8578 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008580 if (substring == NULL)
8581 return NULL;
8582 result = tailmatch(self, substring, start, end, -1);
8583 Py_DECREF(substring);
8584 if (result) {
8585 Py_RETURN_TRUE;
8586 }
8587 }
8588 /* nothing matched */
8589 Py_RETURN_FALSE;
8590 }
8591 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008594 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008596 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597}
8598
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008603Return True if S ends with the specified suffix, False otherwise.\n\
8604With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008605With optional end, stop comparing S at that position.\n\
8606suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
8608static PyObject *
8609unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008612 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008614 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008615 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008616 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008618 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8620 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008621 if (PyTuple_Check(subobj)) {
8622 Py_ssize_t i;
8623 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8624 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008626 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008628 result = tailmatch(self, substring, start, end, +1);
8629 Py_DECREF(substring);
8630 if (result) {
8631 Py_RETURN_TRUE;
8632 }
8633 }
8634 Py_RETURN_FALSE;
8635 }
8636 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008640 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008642 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643}
8644
Eric Smith8c663262007-08-25 02:26:07 +00008645#include "stringlib/string_format.h"
8646
8647PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008649\n\
8650");
8651
Eric Smith4a7d76d2008-05-30 18:10:19 +00008652static PyObject *
8653unicode__format__(PyObject* self, PyObject* args)
8654{
8655 PyObject *format_spec;
8656
8657 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8658 return NULL;
8659
8660 return _PyUnicode_FormatAdvanced(self,
8661 PyUnicode_AS_UNICODE(format_spec),
8662 PyUnicode_GET_SIZE(format_spec));
8663}
8664
Eric Smith8c663262007-08-25 02:26:07 +00008665PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008667\n\
8668");
8669
8670static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008671unicode__sizeof__(PyUnicodeObject *v)
8672{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008673 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8674 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008675}
8676
8677PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008679
8680static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008681unicode_getnewargs(PyUnicodeObject *v)
8682{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008683 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008684}
8685
8686
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687static PyMethodDef unicode_methods[] = {
8688
8689 /* Order is according to common usage: often used methods should
8690 appear first, since lookup is done sequentially. */
8691
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008692 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8693 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8694 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008695 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008696 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8697 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8698 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8699 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8700 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8701 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8702 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008703 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008704 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8705 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8706 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008707 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008708 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8709 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8710 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008711 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008713 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008714 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008715 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8716 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8717 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8718 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8719 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8720 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8721 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8722 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8723 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8724 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8725 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8726 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8727 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8728 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008729 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008730 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008731 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008732 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008733 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008734 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8735 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008736 {"maketrans", (PyCFunction) unicode_maketrans,
8737 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008738 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008739#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008740 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741#endif
8742
8743#if 0
8744 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008745 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746#endif
8747
Benjamin Peterson14339b62009-01-31 16:36:08 +00008748 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 {NULL, NULL}
8750};
8751
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008752static PyObject *
8753unicode_mod(PyObject *v, PyObject *w)
8754{
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 if (!PyUnicode_Check(v)) {
8756 Py_INCREF(Py_NotImplemented);
8757 return Py_NotImplemented;
8758 }
8759 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008760}
8761
8762static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008763 0, /*nb_add*/
8764 0, /*nb_subtract*/
8765 0, /*nb_multiply*/
8766 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008767};
8768
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 (lenfunc) unicode_length, /* sq_length */
8771 PyUnicode_Concat, /* sq_concat */
8772 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8773 (ssizeargfunc) unicode_getitem, /* sq_item */
8774 0, /* sq_slice */
8775 0, /* sq_ass_item */
8776 0, /* sq_ass_slice */
8777 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778};
8779
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008780static PyObject*
8781unicode_subscript(PyUnicodeObject* self, PyObject* item)
8782{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008783 if (PyIndex_Check(item)) {
8784 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008785 if (i == -1 && PyErr_Occurred())
8786 return NULL;
8787 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008788 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008789 return unicode_getitem(self, i);
8790 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008791 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008792 Py_UNICODE* source_buf;
8793 Py_UNICODE* result_buf;
8794 PyObject* result;
8795
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008796 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008798 return NULL;
8799 }
8800
8801 if (slicelength <= 0) {
8802 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008803 } else if (start == 0 && step == 1 && slicelength == self->length &&
8804 PyUnicode_CheckExact(self)) {
8805 Py_INCREF(self);
8806 return (PyObject *)self;
8807 } else if (step == 1) {
8808 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008809 } else {
8810 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008811 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8812 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 if (result_buf == NULL)
8815 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008816
8817 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8818 result_buf[i] = source_buf[cur];
8819 }
Tim Petersced69f82003-09-16 20:30:58 +00008820
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008821 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008822 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008823 return result;
8824 }
8825 } else {
8826 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8827 return NULL;
8828 }
8829}
8830
8831static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 (lenfunc)unicode_length, /* mp_length */
8833 (binaryfunc)unicode_subscript, /* mp_subscript */
8834 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008835};
8836
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838/* Helpers for PyUnicode_Format() */
8839
8840static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008843 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 (*p_argidx)++;
8846 if (arglen < 0)
8847 return args;
8848 else
8849 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 }
8851 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 return NULL;
8854}
8855
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008856/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008858static PyObject *
8859formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008861 char *p;
8862 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008864
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 x = PyFloat_AsDouble(v);
8866 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008867 return NULL;
8868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008871
Eric Smith0923d1d2009-04-16 20:16:10 +00008872 p = PyOS_double_to_string(x, type, prec,
8873 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008874 if (p == NULL)
8875 return NULL;
8876 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008877 PyMem_Free(p);
8878 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879}
8880
Tim Peters38fd5b62000-09-21 05:43:11 +00008881static PyObject*
8882formatlong(PyObject *val, int flags, int prec, int type)
8883{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 char *buf;
8885 int len;
8886 PyObject *str; /* temporary string object. */
8887 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008888
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8890 if (!str)
8891 return NULL;
8892 result = PyUnicode_FromStringAndSize(buf, len);
8893 Py_DECREF(str);
8894 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008895}
8896
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897static int
8898formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008899 size_t buflen,
8900 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008902 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008903 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 if (PyUnicode_GET_SIZE(v) == 1) {
8905 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8906 buf[1] = '\0';
8907 return 1;
8908 }
8909#ifndef Py_UNICODE_WIDE
8910 if (PyUnicode_GET_SIZE(v) == 2) {
8911 /* Decode a valid surrogate pair */
8912 int c0 = PyUnicode_AS_UNICODE(v)[0];
8913 int c1 = PyUnicode_AS_UNICODE(v)[1];
8914 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8915 0xDC00 <= c1 && c1 <= 0xDFFF) {
8916 buf[0] = c0;
8917 buf[1] = c1;
8918 buf[2] = '\0';
8919 return 2;
8920 }
8921 }
8922#endif
8923 goto onError;
8924 }
8925 else {
8926 /* Integer input truncated to a character */
8927 long x;
8928 x = PyLong_AsLong(v);
8929 if (x == -1 && PyErr_Occurred())
8930 goto onError;
8931
8932 if (x < 0 || x > 0x10ffff) {
8933 PyErr_SetString(PyExc_OverflowError,
8934 "%c arg not in range(0x110000)");
8935 return -1;
8936 }
8937
8938#ifndef Py_UNICODE_WIDE
8939 if (x > 0xffff) {
8940 x -= 0x10000;
8941 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8942 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8943 return 2;
8944 }
8945#endif
8946 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008947 buf[1] = '\0';
8948 return 1;
8949 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008950
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008952 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008954 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955}
8956
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008957/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008958 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008959*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
8965 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008966 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 int args_owned = 0;
8968 PyUnicodeObject *result = NULL;
8969 PyObject *dict = NULL;
8970 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadInternalCall();
8974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
8976 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008977 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 fmt = PyUnicode_AS_UNICODE(uformat);
8980 fmtcnt = PyUnicode_GET_SIZE(uformat);
8981
8982 reslen = rescnt = fmtcnt + 100;
8983 result = _PyUnicode_New(reslen);
8984 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 res = PyUnicode_AS_UNICODE(result);
8987
8988 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 arglen = PyTuple_Size(args);
8990 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 }
8992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 arglen = -1;
8994 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008996 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008997 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999
9000 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 if (*fmt != '%') {
9002 if (--rescnt < 0) {
9003 rescnt = fmtcnt + 100;
9004 reslen += rescnt;
9005 if (_PyUnicode_Resize(&result, reslen) < 0)
9006 goto onError;
9007 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9008 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009011 }
9012 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 /* Got a format specifier */
9014 int flags = 0;
9015 Py_ssize_t width = -1;
9016 int prec = -1;
9017 Py_UNICODE c = '\0';
9018 Py_UNICODE fill;
9019 int isnumok;
9020 PyObject *v = NULL;
9021 PyObject *temp = NULL;
9022 Py_UNICODE *pbuf;
9023 Py_UNICODE sign;
9024 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009025 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 fmt++;
9028 if (*fmt == '(') {
9029 Py_UNICODE *keystart;
9030 Py_ssize_t keylen;
9031 PyObject *key;
9032 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 if (dict == NULL) {
9035 PyErr_SetString(PyExc_TypeError,
9036 "format requires a mapping");
9037 goto onError;
9038 }
9039 ++fmt;
9040 --fmtcnt;
9041 keystart = fmt;
9042 /* Skip over balanced parentheses */
9043 while (pcount > 0 && --fmtcnt >= 0) {
9044 if (*fmt == ')')
9045 --pcount;
9046 else if (*fmt == '(')
9047 ++pcount;
9048 fmt++;
9049 }
9050 keylen = fmt - keystart - 1;
9051 if (fmtcnt < 0 || pcount > 0) {
9052 PyErr_SetString(PyExc_ValueError,
9053 "incomplete format key");
9054 goto onError;
9055 }
9056#if 0
9057 /* keys are converted to strings using UTF-8 and
9058 then looked up since Python uses strings to hold
9059 variables names etc. in its namespaces and we
9060 wouldn't want to break common idioms. */
9061 key = PyUnicode_EncodeUTF8(keystart,
9062 keylen,
9063 NULL);
9064#else
9065 key = PyUnicode_FromUnicode(keystart, keylen);
9066#endif
9067 if (key == NULL)
9068 goto onError;
9069 if (args_owned) {
9070 Py_DECREF(args);
9071 args_owned = 0;
9072 }
9073 args = PyObject_GetItem(dict, key);
9074 Py_DECREF(key);
9075 if (args == NULL) {
9076 goto onError;
9077 }
9078 args_owned = 1;
9079 arglen = -1;
9080 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 while (--fmtcnt >= 0) {
9083 switch (c = *fmt++) {
9084 case '-': flags |= F_LJUST; continue;
9085 case '+': flags |= F_SIGN; continue;
9086 case ' ': flags |= F_BLANK; continue;
9087 case '#': flags |= F_ALT; continue;
9088 case '0': flags |= F_ZERO; continue;
9089 }
9090 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 if (c == '*') {
9093 v = getnextarg(args, arglen, &argidx);
9094 if (v == NULL)
9095 goto onError;
9096 if (!PyLong_Check(v)) {
9097 PyErr_SetString(PyExc_TypeError,
9098 "* wants int");
9099 goto onError;
9100 }
9101 width = PyLong_AsLong(v);
9102 if (width == -1 && PyErr_Occurred())
9103 goto onError;
9104 if (width < 0) {
9105 flags |= F_LJUST;
9106 width = -width;
9107 }
9108 if (--fmtcnt >= 0)
9109 c = *fmt++;
9110 }
9111 else if (c >= '0' && c <= '9') {
9112 width = c - '0';
9113 while (--fmtcnt >= 0) {
9114 c = *fmt++;
9115 if (c < '0' || c > '9')
9116 break;
9117 if ((width*10) / 10 != width) {
9118 PyErr_SetString(PyExc_ValueError,
9119 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 }
9122 width = width*10 + (c - '0');
9123 }
9124 }
9125 if (c == '.') {
9126 prec = 0;
9127 if (--fmtcnt >= 0)
9128 c = *fmt++;
9129 if (c == '*') {
9130 v = getnextarg(args, arglen, &argidx);
9131 if (v == NULL)
9132 goto onError;
9133 if (!PyLong_Check(v)) {
9134 PyErr_SetString(PyExc_TypeError,
9135 "* wants int");
9136 goto onError;
9137 }
9138 prec = PyLong_AsLong(v);
9139 if (prec == -1 && PyErr_Occurred())
9140 goto onError;
9141 if (prec < 0)
9142 prec = 0;
9143 if (--fmtcnt >= 0)
9144 c = *fmt++;
9145 }
9146 else if (c >= '0' && c <= '9') {
9147 prec = c - '0';
9148 while (--fmtcnt >= 0) {
9149 c = Py_CHARMASK(*fmt++);
9150 if (c < '0' || c > '9')
9151 break;
9152 if ((prec*10) / 10 != prec) {
9153 PyErr_SetString(PyExc_ValueError,
9154 "prec too big");
9155 goto onError;
9156 }
9157 prec = prec*10 + (c - '0');
9158 }
9159 }
9160 } /* prec */
9161 if (fmtcnt >= 0) {
9162 if (c == 'h' || c == 'l' || c == 'L') {
9163 if (--fmtcnt >= 0)
9164 c = *fmt++;
9165 }
9166 }
9167 if (fmtcnt < 0) {
9168 PyErr_SetString(PyExc_ValueError,
9169 "incomplete format");
9170 goto onError;
9171 }
9172 if (c != '%') {
9173 v = getnextarg(args, arglen, &argidx);
9174 if (v == NULL)
9175 goto onError;
9176 }
9177 sign = 0;
9178 fill = ' ';
9179 switch (c) {
9180
9181 case '%':
9182 pbuf = formatbuf;
9183 /* presume that buffer length is at least 1 */
9184 pbuf[0] = '%';
9185 len = 1;
9186 break;
9187
9188 case 's':
9189 case 'r':
9190 case 'a':
9191 if (PyUnicode_Check(v) && c == 's') {
9192 temp = v;
9193 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
9195 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 if (c == 's')
9197 temp = PyObject_Str(v);
9198 else if (c == 'r')
9199 temp = PyObject_Repr(v);
9200 else
9201 temp = PyObject_ASCII(v);
9202 if (temp == NULL)
9203 goto onError;
9204 if (PyUnicode_Check(temp))
9205 /* nothing to do */;
9206 else {
9207 Py_DECREF(temp);
9208 PyErr_SetString(PyExc_TypeError,
9209 "%s argument has non-string str()");
9210 goto onError;
9211 }
9212 }
9213 pbuf = PyUnicode_AS_UNICODE(temp);
9214 len = PyUnicode_GET_SIZE(temp);
9215 if (prec >= 0 && len > prec)
9216 len = prec;
9217 break;
9218
9219 case 'i':
9220 case 'd':
9221 case 'u':
9222 case 'o':
9223 case 'x':
9224 case 'X':
9225 if (c == 'i')
9226 c = 'd';
9227 isnumok = 0;
9228 if (PyNumber_Check(v)) {
9229 PyObject *iobj=NULL;
9230
9231 if (PyLong_Check(v)) {
9232 iobj = v;
9233 Py_INCREF(iobj);
9234 }
9235 else {
9236 iobj = PyNumber_Long(v);
9237 }
9238 if (iobj!=NULL) {
9239 if (PyLong_Check(iobj)) {
9240 isnumok = 1;
9241 temp = formatlong(iobj, flags, prec, c);
9242 Py_DECREF(iobj);
9243 if (!temp)
9244 goto onError;
9245 pbuf = PyUnicode_AS_UNICODE(temp);
9246 len = PyUnicode_GET_SIZE(temp);
9247 sign = 1;
9248 }
9249 else {
9250 Py_DECREF(iobj);
9251 }
9252 }
9253 }
9254 if (!isnumok) {
9255 PyErr_Format(PyExc_TypeError,
9256 "%%%c format: a number is required, "
9257 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9258 goto onError;
9259 }
9260 if (flags & F_ZERO)
9261 fill = '0';
9262 break;
9263
9264 case 'e':
9265 case 'E':
9266 case 'f':
9267 case 'F':
9268 case 'g':
9269 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009270 temp = formatfloat(v, flags, prec, c);
9271 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009272 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009273 pbuf = PyUnicode_AS_UNICODE(temp);
9274 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 sign = 1;
9276 if (flags & F_ZERO)
9277 fill = '0';
9278 break;
9279
9280 case 'c':
9281 pbuf = formatbuf;
9282 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9283 if (len < 0)
9284 goto onError;
9285 break;
9286
9287 default:
9288 PyErr_Format(PyExc_ValueError,
9289 "unsupported format character '%c' (0x%x) "
9290 "at index %zd",
9291 (31<=c && c<=126) ? (char)c : '?',
9292 (int)c,
9293 (Py_ssize_t)(fmt - 1 -
9294 PyUnicode_AS_UNICODE(uformat)));
9295 goto onError;
9296 }
9297 if (sign) {
9298 if (*pbuf == '-' || *pbuf == '+') {
9299 sign = *pbuf++;
9300 len--;
9301 }
9302 else if (flags & F_SIGN)
9303 sign = '+';
9304 else if (flags & F_BLANK)
9305 sign = ' ';
9306 else
9307 sign = 0;
9308 }
9309 if (width < len)
9310 width = len;
9311 if (rescnt - (sign != 0) < width) {
9312 reslen -= rescnt;
9313 rescnt = width + fmtcnt + 100;
9314 reslen += rescnt;
9315 if (reslen < 0) {
9316 Py_XDECREF(temp);
9317 PyErr_NoMemory();
9318 goto onError;
9319 }
9320 if (_PyUnicode_Resize(&result, reslen) < 0) {
9321 Py_XDECREF(temp);
9322 goto onError;
9323 }
9324 res = PyUnicode_AS_UNICODE(result)
9325 + reslen - rescnt;
9326 }
9327 if (sign) {
9328 if (fill != ' ')
9329 *res++ = sign;
9330 rescnt--;
9331 if (width > len)
9332 width--;
9333 }
9334 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9335 assert(pbuf[0] == '0');
9336 assert(pbuf[1] == c);
9337 if (fill != ' ') {
9338 *res++ = *pbuf++;
9339 *res++ = *pbuf++;
9340 }
9341 rescnt -= 2;
9342 width -= 2;
9343 if (width < 0)
9344 width = 0;
9345 len -= 2;
9346 }
9347 if (width > len && !(flags & F_LJUST)) {
9348 do {
9349 --rescnt;
9350 *res++ = fill;
9351 } while (--width > len);
9352 }
9353 if (fill == ' ') {
9354 if (sign)
9355 *res++ = sign;
9356 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9357 assert(pbuf[0] == '0');
9358 assert(pbuf[1] == c);
9359 *res++ = *pbuf++;
9360 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009361 }
9362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 Py_UNICODE_COPY(res, pbuf, len);
9364 res += len;
9365 rescnt -= len;
9366 while (--width >= len) {
9367 --rescnt;
9368 *res++ = ' ';
9369 }
9370 if (dict && (argidx < arglen) && c != '%') {
9371 PyErr_SetString(PyExc_TypeError,
9372 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009373 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 goto onError;
9375 }
9376 Py_XDECREF(temp);
9377 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 } /* until end */
9379 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 PyErr_SetString(PyExc_TypeError,
9381 "not all arguments converted during string formatting");
9382 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 }
9384
Thomas Woutersa96affe2006-03-12 00:29:36 +00009385 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 }
9390 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 return (PyObject *)result;
9392
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 Py_XDECREF(result);
9395 Py_DECREF(uformat);
9396 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 }
9399 return NULL;
9400}
9401
Jeremy Hylton938ace62002-07-17 16:30:39 +00009402static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009403unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9404
Tim Peters6d6c1a32001-08-02 04:15:00 +00009405static PyObject *
9406unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9407{
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009409 static char *kwlist[] = {"object", "encoding", "errors", 0};
9410 char *encoding = NULL;
9411 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009412
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 if (type != &PyUnicode_Type)
9414 return unicode_subtype_new(type, args, kwds);
9415 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009417 return NULL;
9418 if (x == NULL)
9419 return (PyObject *)_PyUnicode_New(0);
9420 if (encoding == NULL && errors == NULL)
9421 return PyObject_Str(x);
9422 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009424}
9425
Guido van Rossume023fe02001-08-30 03:12:59 +00009426static PyObject *
9427unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9428{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009429 PyUnicodeObject *tmp, *pnew;
9430 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009431
Benjamin Peterson14339b62009-01-31 16:36:08 +00009432 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9433 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9434 if (tmp == NULL)
9435 return NULL;
9436 assert(PyUnicode_Check(tmp));
9437 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9438 if (pnew == NULL) {
9439 Py_DECREF(tmp);
9440 return NULL;
9441 }
9442 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9443 if (pnew->str == NULL) {
9444 _Py_ForgetReference((PyObject *)pnew);
9445 PyObject_Del(pnew);
9446 Py_DECREF(tmp);
9447 return PyErr_NoMemory();
9448 }
9449 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9450 pnew->length = n;
9451 pnew->hash = tmp->hash;
9452 Py_DECREF(tmp);
9453 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009454}
9455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009456PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009458\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009459Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009460encoding defaults to the current default string encoding.\n\
9461errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009462
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009463static PyObject *unicode_iter(PyObject *seq);
9464
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009466 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009467 "str", /* tp_name */
9468 sizeof(PyUnicodeObject), /* tp_size */
9469 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009471 (destructor)unicode_dealloc, /* tp_dealloc */
9472 0, /* tp_print */
9473 0, /* tp_getattr */
9474 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009475 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009476 unicode_repr, /* tp_repr */
9477 &unicode_as_number, /* tp_as_number */
9478 &unicode_as_sequence, /* tp_as_sequence */
9479 &unicode_as_mapping, /* tp_as_mapping */
9480 (hashfunc) unicode_hash, /* tp_hash*/
9481 0, /* tp_call*/
9482 (reprfunc) unicode_str, /* tp_str */
9483 PyObject_GenericGetAttr, /* tp_getattro */
9484 0, /* tp_setattro */
9485 0, /* tp_as_buffer */
9486 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009488 unicode_doc, /* tp_doc */
9489 0, /* tp_traverse */
9490 0, /* tp_clear */
9491 PyUnicode_RichCompare, /* tp_richcompare */
9492 0, /* tp_weaklistoffset */
9493 unicode_iter, /* tp_iter */
9494 0, /* tp_iternext */
9495 unicode_methods, /* tp_methods */
9496 0, /* tp_members */
9497 0, /* tp_getset */
9498 &PyBaseObject_Type, /* tp_base */
9499 0, /* tp_dict */
9500 0, /* tp_descr_get */
9501 0, /* tp_descr_set */
9502 0, /* tp_dictoffset */
9503 0, /* tp_init */
9504 0, /* tp_alloc */
9505 unicode_new, /* tp_new */
9506 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507};
9508
9509/* Initialize the Unicode implementation */
9510
Thomas Wouters78890102000-07-22 19:25:51 +00009511void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009513 int i;
9514
Thomas Wouters477c8d52006-05-27 19:21:47 +00009515 /* XXX - move this array to unicodectype.c ? */
9516 Py_UNICODE linebreak[] = {
9517 0x000A, /* LINE FEED */
9518 0x000D, /* CARRIAGE RETURN */
9519 0x001C, /* FILE SEPARATOR */
9520 0x001D, /* GROUP SEPARATOR */
9521 0x001E, /* RECORD SEPARATOR */
9522 0x0085, /* NEXT LINE */
9523 0x2028, /* LINE SEPARATOR */
9524 0x2029, /* PARAGRAPH SEPARATOR */
9525 };
9526
Fred Drakee4315f52000-05-09 19:53:39 +00009527 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009528 free_list = NULL;
9529 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009531 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009533
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009534 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009536 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009538
9539 /* initialize the linebreak bloom filter */
9540 bloom_linebreak = make_bloom_mask(
9541 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9542 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009543
9544 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545}
9546
9547/* Finalize the Unicode implementation */
9548
Christian Heimesa156e092008-02-16 07:38:31 +00009549int
9550PyUnicode_ClearFreeList(void)
9551{
9552 int freelist_size = numfree;
9553 PyUnicodeObject *u;
9554
9555 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 PyUnicodeObject *v = u;
9557 u = *(PyUnicodeObject **)u;
9558 if (v->str)
9559 PyObject_DEL(v->str);
9560 Py_XDECREF(v->defenc);
9561 PyObject_Del(v);
9562 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009563 }
9564 free_list = NULL;
9565 assert(numfree == 0);
9566 return freelist_size;
9567}
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569void
Thomas Wouters78890102000-07-22 19:25:51 +00009570_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009572 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009574 Py_XDECREF(unicode_empty);
9575 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009576
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009577 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 if (unicode_latin1[i]) {
9579 Py_DECREF(unicode_latin1[i]);
9580 unicode_latin1[i] = NULL;
9581 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009582 }
Christian Heimesa156e092008-02-16 07:38:31 +00009583 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009585
Walter Dörwald16807132007-05-25 13:52:07 +00009586void
9587PyUnicode_InternInPlace(PyObject **p)
9588{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009589 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9590 PyObject *t;
9591 if (s == NULL || !PyUnicode_Check(s))
9592 Py_FatalError(
9593 "PyUnicode_InternInPlace: unicode strings only please!");
9594 /* If it's a subclass, we don't really know what putting
9595 it in the interned dict might do. */
9596 if (!PyUnicode_CheckExact(s))
9597 return;
9598 if (PyUnicode_CHECK_INTERNED(s))
9599 return;
9600 if (interned == NULL) {
9601 interned = PyDict_New();
9602 if (interned == NULL) {
9603 PyErr_Clear(); /* Don't leave an exception */
9604 return;
9605 }
9606 }
9607 /* It might be that the GetItem call fails even
9608 though the key is present in the dictionary,
9609 namely when this happens during a stack overflow. */
9610 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009613
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 if (t) {
9615 Py_INCREF(t);
9616 Py_DECREF(*p);
9617 *p = t;
9618 return;
9619 }
Walter Dörwald16807132007-05-25 13:52:07 +00009620
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 PyThreadState_GET()->recursion_critical = 1;
9622 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9623 PyErr_Clear();
9624 PyThreadState_GET()->recursion_critical = 0;
9625 return;
9626 }
9627 PyThreadState_GET()->recursion_critical = 0;
9628 /* The two references in interned are not counted by refcnt.
9629 The deallocator will take care of this */
9630 Py_REFCNT(s) -= 2;
9631 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009632}
9633
9634void
9635PyUnicode_InternImmortal(PyObject **p)
9636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009637 PyUnicode_InternInPlace(p);
9638 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9639 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9640 Py_INCREF(*p);
9641 }
Walter Dörwald16807132007-05-25 13:52:07 +00009642}
9643
9644PyObject *
9645PyUnicode_InternFromString(const char *cp)
9646{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009647 PyObject *s = PyUnicode_FromString(cp);
9648 if (s == NULL)
9649 return NULL;
9650 PyUnicode_InternInPlace(&s);
9651 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009652}
9653
9654void _Py_ReleaseInternedUnicodeStrings(void)
9655{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009656 PyObject *keys;
9657 PyUnicodeObject *s;
9658 Py_ssize_t i, n;
9659 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009660
Benjamin Peterson14339b62009-01-31 16:36:08 +00009661 if (interned == NULL || !PyDict_Check(interned))
9662 return;
9663 keys = PyDict_Keys(interned);
9664 if (keys == NULL || !PyList_Check(keys)) {
9665 PyErr_Clear();
9666 return;
9667 }
Walter Dörwald16807132007-05-25 13:52:07 +00009668
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9670 detector, interned unicode strings are not forcibly deallocated;
9671 rather, we give them their stolen references back, and then clear
9672 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009673
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 n = PyList_GET_SIZE(keys);
9675 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 for (i = 0; i < n; i++) {
9678 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9679 switch (s->state) {
9680 case SSTATE_NOT_INTERNED:
9681 /* XXX Shouldn't happen */
9682 break;
9683 case SSTATE_INTERNED_IMMORTAL:
9684 Py_REFCNT(s) += 1;
9685 immortal_size += s->length;
9686 break;
9687 case SSTATE_INTERNED_MORTAL:
9688 Py_REFCNT(s) += 2;
9689 mortal_size += s->length;
9690 break;
9691 default:
9692 Py_FatalError("Inconsistent interned string state.");
9693 }
9694 s->state = SSTATE_NOT_INTERNED;
9695 }
9696 fprintf(stderr, "total size of all interned strings: "
9697 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9698 "mortal/immortal\n", mortal_size, immortal_size);
9699 Py_DECREF(keys);
9700 PyDict_Clear(interned);
9701 Py_DECREF(interned);
9702 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009703}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009704
9705
/********************* Unicode Iterator **************************/
9707
9708typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009709 PyObject_HEAD
9710 Py_ssize_t it_index;
9711 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009712} unicodeiterobject;
9713
9714static void
9715unicodeiter_dealloc(unicodeiterobject *it)
9716{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009717 _PyObject_GC_UNTRACK(it);
9718 Py_XDECREF(it->it_seq);
9719 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009720}
9721
9722static int
9723unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 Py_VISIT(it->it_seq);
9726 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009727}
9728
9729static PyObject *
9730unicodeiter_next(unicodeiterobject *it)
9731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732 PyUnicodeObject *seq;
9733 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009734
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 assert(it != NULL);
9736 seq = it->it_seq;
9737 if (seq == NULL)
9738 return NULL;
9739 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009740
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9742 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 if (item != NULL)
9745 ++it->it_index;
9746 return item;
9747 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009748
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 Py_DECREF(seq);
9750 it->it_seq = NULL;
9751 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009752}
9753
9754static PyObject *
9755unicodeiter_len(unicodeiterobject *it)
9756{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009757 Py_ssize_t len = 0;
9758 if (it->it_seq)
9759 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9760 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009761}
9762
9763PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9764
9765static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009768 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009769};
9770
9771PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009772 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9773 "str_iterator", /* tp_name */
9774 sizeof(unicodeiterobject), /* tp_basicsize */
9775 0, /* tp_itemsize */
9776 /* methods */
9777 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9778 0, /* tp_print */
9779 0, /* tp_getattr */
9780 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009781 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782 0, /* tp_repr */
9783 0, /* tp_as_number */
9784 0, /* tp_as_sequence */
9785 0, /* tp_as_mapping */
9786 0, /* tp_hash */
9787 0, /* tp_call */
9788 0, /* tp_str */
9789 PyObject_GenericGetAttr, /* tp_getattro */
9790 0, /* tp_setattro */
9791 0, /* tp_as_buffer */
9792 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9793 0, /* tp_doc */
9794 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9795 0, /* tp_clear */
9796 0, /* tp_richcompare */
9797 0, /* tp_weaklistoffset */
9798 PyObject_SelfIter, /* tp_iter */
9799 (iternextfunc)unicodeiter_next, /* tp_iternext */
9800 unicodeiter_methods, /* tp_methods */
9801 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009802};
9803
9804static PyObject *
9805unicode_iter(PyObject *seq)
9806{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009808
Benjamin Peterson14339b62009-01-31 16:36:08 +00009809 if (!PyUnicode_Check(seq)) {
9810 PyErr_BadInternalCall();
9811 return NULL;
9812 }
9813 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9814 if (it == NULL)
9815 return NULL;
9816 it->it_index = 0;
9817 Py_INCREF(seq);
9818 it->it_seq = (PyUnicodeObject *)seq;
9819 _PyObject_GC_TRACK(it);
9820 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009821}
9822
Martin v. Löwis5b222132007-06-10 09:51:05 +00009823size_t
9824Py_UNICODE_strlen(const Py_UNICODE *u)
9825{
9826 int res = 0;
9827 while(*u++)
9828 res++;
9829 return res;
9830}
9831
9832Py_UNICODE*
9833Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9834{
9835 Py_UNICODE *u = s1;
9836 while ((*u++ = *s2++));
9837 return s1;
9838}
9839
9840Py_UNICODE*
9841Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9842{
9843 Py_UNICODE *u = s1;
9844 while ((*u++ = *s2++))
9845 if (n-- == 0)
9846 break;
9847 return s1;
9848}
9849
9850int
9851Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9852{
9853 while (*s1 && *s2 && *s1 == *s2)
9854 s1++, s2++;
9855 if (*s1 && *s2)
9856 return (*s1 < *s2) ? -1 : +1;
9857 if (*s1)
9858 return 1;
9859 if (*s2)
9860 return -1;
9861 return 0;
9862}
9863
9864Py_UNICODE*
9865Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9866{
9867 const Py_UNICODE *p;
9868 for (p = s; *p; p++)
9869 if (*p == c)
9870 return (Py_UNICODE*)p;
9871 return NULL;
9872}
9873
9874
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009875#ifdef __cplusplus
9876}
9877#endif
9878
9879
/*
  Local variables:
  c-basic-offset: 4
  indent-tabs-mode: nil
  End:
*/