blob: c9808e1cc651093942054df811bcf2ef1b6630c3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 when the ASCII character c is whitespace,
   0 otherwise.  Each row below covers 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Lookup table for fast detection of ASCII linebreak characters: entry c
   is 1 when the ASCII character c is a linebreak, 0 otherwise.  The table
   is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type holding a bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for linebreak characters. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 does
   not shift into the sign bit of an int; mask is parenthesized so that
   any expression can be passed for it. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if ch is a linebreak.  Characters below 128 are answered from the
   ascii_linebreak table; all others are pre-filtered with the bloom mask
   before Py_UNICODE_ISLINEBREAK is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* True if chr occurs in set (of setlen characters): the bloom mask is
   tested first and the exact search is only run on a possible hit.  The
   whole expansion is parenthesized so the macro can safely be negated or
   combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Append the NUL-terminated string to the output position s, advancing
   s past the copied characters.  Relies on variables named copy and s
   being declared in the scope where the macro is used. */
#define appendstring(string)                    \
    {                                           \
        for (copy = (string); *copy;)           \
            *s++ = *copy++;                     \
    }
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000794 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* Convert the argument to a bytes object, according to the file
1534 system encoding */
1535
1536int
1537PyUnicode_FSConverter(PyObject* arg, void* addr)
1538{
1539 PyObject *output = NULL;
1540 Py_ssize_t size;
1541 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001542 if (arg == NULL) {
1543 Py_DECREF(*(PyObject**)addr);
1544 return 1;
1545 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001546 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1547 output = arg;
1548 Py_INCREF(output);
1549 }
1550 else {
1551 arg = PyUnicode_FromObject(arg);
1552 if (!arg)
1553 return 0;
1554 output = PyUnicode_AsEncodedObject(arg,
1555 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001556 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001557 Py_DECREF(arg);
1558 if (!output)
1559 return 0;
1560 if (!PyBytes_Check(output)) {
1561 Py_DECREF(output);
1562 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1563 return 0;
1564 }
1565 }
1566 if (PyBytes_Check(output)) {
1567 size = PyBytes_GET_SIZE(output);
1568 data = PyBytes_AS_STRING(output);
1569 }
1570 else {
1571 size = PyByteArray_GET_SIZE(output);
1572 data = PyByteArray_AS_STRING(output);
1573 }
1574 if (size != strlen(data)) {
1575 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1576 Py_DECREF(output);
1577 return 0;
1578 }
1579 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001580 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001581}
1582
1583
Martin v. Löwis5b222132007-06-10 09:51:05 +00001584char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001585_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001586{
Christian Heimesf3863112007-11-22 07:46:41 +00001587 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001588 if (!PyUnicode_Check(unicode)) {
1589 PyErr_BadArgument();
1590 return NULL;
1591 }
Christian Heimesf3863112007-11-22 07:46:41 +00001592 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1593 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001594 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001595 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001596 *psize = PyBytes_GET_SIZE(bytes);
1597 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598}
1599
1600char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001601_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001602{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001603 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001604}
1605
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1607{
1608 if (!PyUnicode_Check(unicode)) {
1609 PyErr_BadArgument();
1610 goto onError;
1611 }
1612 return PyUnicode_AS_UNICODE(unicode);
1613
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 return NULL;
1616}
1617
Martin v. Löwis18e16552006-02-15 17:27:45 +00001618Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619{
1620 if (!PyUnicode_Check(unicode)) {
1621 PyErr_BadArgument();
1622 goto onError;
1623 }
1624 return PyUnicode_GET_SIZE(unicode);
1625
Benjamin Peterson29060642009-01-31 22:14:21 +00001626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 return -1;
1628}
1629
Thomas Wouters78890102000-07-22 19:25:51 +00001630const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001631{
1632 return unicode_default_encoding;
1633}
1634
1635int PyUnicode_SetDefaultEncoding(const char *encoding)
1636{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001637 if (strcmp(encoding, unicode_default_encoding) != 0) {
1638 PyErr_Format(PyExc_ValueError,
1639 "Can only set default encoding to %s",
1640 unicode_default_encoding);
1641 return -1;
1642 }
Fred Drakee4315f52000-05-09 19:53:39 +00001643 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001644}
1645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646/* error handling callback helper:
1647 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001648 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 and adjust various state variables.
1650 return 0 on success, -1 on error
1651*/
1652
1653static
1654int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001655 const char *encoding, const char *reason,
1656 const char **input, const char **inend, Py_ssize_t *startinpos,
1657 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1658 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001660 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661
1662 PyObject *restuple = NULL;
1663 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001664 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001665 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t requiredsize;
1667 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001669 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001670 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 int res = -1;
1672
1673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 *errorHandler = PyCodec_LookupError(errors);
1675 if (*errorHandler == NULL)
1676 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 }
1678
1679 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1682 if (*exceptionObject == NULL)
1683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 }
1685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001686 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1687 goto onError;
1688 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1689 goto onError;
1690 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 }
1693
1694 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1695 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001696 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001698 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 }
1701 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001703
1704 /* Copy back the bytes variables, which might have been modified by the
1705 callback */
1706 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1707 if (!inputobj)
1708 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001709 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001710 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001711 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001712 *input = PyBytes_AS_STRING(inputobj);
1713 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001715 /* we can DECREF safely, as the exception has another reference,
1716 so the object won't go away. */
1717 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001720 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001721 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1723 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725
1726 /* need more space? (at least enough for what we
1727 have+the replacement+the rest of the string (starting
1728 at the new input position), so we won't have to check space
1729 when there are no errors in the rest of the string) */
1730 repptr = PyUnicode_AS_UNICODE(repunicode);
1731 repsize = PyUnicode_GET_SIZE(repunicode);
1732 requiredsize = *outpos + repsize + insize-newpos;
1733 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 if (requiredsize<2*outsize)
1735 requiredsize = 2*outsize;
1736 if (_PyUnicode_Resize(output, requiredsize) < 0)
1737 goto onError;
1738 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 }
1740 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001741 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_UNICODE_COPY(*outptr, repptr, repsize);
1743 *outptr += repsize;
1744 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 /* we made it! */
1747 res = 0;
1748
Benjamin Peterson29060642009-01-31 22:14:21 +00001749 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 Py_XDECREF(restuple);
1751 return res;
1752}
1753
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754/* --- UTF-7 Codec -------------------------------------------------------- */
1755
Antoine Pitrou244651a2009-05-04 18:56:13 +00001756/* See RFC2152 for details. We encode conservatively and decode liberally. */
1757
/* Three simple macros defining base-64.  IS_BASE64 and FROM_BASE64
   expand their argument several times, so the argument must be free of
   side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; every other
   input (i.e. '/', given the precondition) yields 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off, so callers need not clear them. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Like the macros above, the argument is expanded more than
 * once. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0-127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * Each row below covers 16 consecutive codes; the comment above a row
 * names the characters in that row.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1822
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * True when c is in the range 1..127 and its utf7_category entry is
 * 0 (Set D), or 2 (whitespace) with directWS set, or 1 (Set O) with
 * directO set.  The range test comes first, so utf7_category is only
 * indexed with a valid subscript.  c is expanded several times and
 * directO/directWS are expanded without parentheses: pass simple,
 * side-effect-free expressions. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 Py_ssize_t size,
1837 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001839 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1840}
1841
Antoine Pitrou244651a2009-05-04 18:56:13 +00001842/* The decoder. The only state we preserve is our read position,
1843 * i.e. how many characters we have consumed. So if we end in the
1844 * middle of a shift sequence we have to back off the read position
1845 * and the output to the beginning of the sequence, otherwise we lose
1846 * all the shift state (seen bits, number of bits seen, high
1847 * surrogate). */
1848
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001849PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001850 Py_ssize_t size,
1851 const char *errors,
1852 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001855 Py_ssize_t startinpos;
1856 Py_ssize_t endinpos;
1857 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 const char *e;
1859 PyUnicodeObject *unicode;
1860 Py_UNICODE *p;
1861 const char *errmsg = "";
1862 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001863 Py_UNICODE *shiftOutStart;
1864 unsigned int base64bits = 0;
1865 unsigned long base64buffer = 0;
1866 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 PyObject *errorHandler = NULL;
1868 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001869
1870 unicode = _PyUnicode_New(size);
1871 if (!unicode)
1872 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001873 if (size == 0) {
1874 if (consumed)
1875 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001877 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878
1879 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001880 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 e = s + size;
1882
1883 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001885 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001886 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001887
Antoine Pitrou244651a2009-05-04 18:56:13 +00001888 if (inShift) { /* in a base-64 section */
1889 if (IS_BASE64(ch)) { /* consume a base-64 character */
1890 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1891 base64bits += 6;
1892 s++;
1893 if (base64bits >= 16) {
1894 /* we have enough bits for a UTF-16 value */
1895 Py_UNICODE outCh = (Py_UNICODE)
1896 (base64buffer >> (base64bits-16));
1897 base64bits -= 16;
1898 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1899 if (surrogate) {
1900 /* expecting a second surrogate */
1901 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1902#ifdef Py_UNICODE_WIDE
1903 *p++ = (((surrogate & 0x3FF)<<10)
1904 | (outCh & 0x3FF)) + 0x10000;
1905#else
1906 *p++ = surrogate;
1907 *p++ = outCh;
1908#endif
1909 surrogate = 0;
1910 }
1911 else {
1912 surrogate = 0;
1913 errmsg = "second surrogate missing";
1914 goto utf7Error;
1915 }
1916 }
1917 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1918 /* first surrogate */
1919 surrogate = outCh;
1920 }
1921 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1922 errmsg = "unexpected second surrogate";
1923 goto utf7Error;
1924 }
1925 else {
1926 *p++ = outCh;
1927 }
1928 }
1929 }
1930 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001931 inShift = 0;
1932 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001933 if (surrogate) {
1934 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001937 if (base64bits > 0) { /* left-over bits */
1938 if (base64bits >= 6) {
1939 /* We've seen at least one base-64 character */
1940 errmsg = "partial character in shift sequence";
1941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001942 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001943 else {
1944 /* Some bits remain; they should be zero */
1945 if (base64buffer != 0) {
1946 errmsg = "non-zero padding bits in shift sequence";
1947 goto utf7Error;
1948 }
1949 }
1950 }
1951 if (ch != '-') {
1952 /* '-' is absorbed; other terminating
1953 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001954 *p++ = ch;
1955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956 }
1957 }
1958 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960 s++; /* consume '+' */
1961 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 s++;
1963 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001964 }
1965 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001966 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 shiftOutStart = p;
1968 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969 }
1970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001971 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 *p++ = ch;
1973 s++;
1974 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 else {
1976 startinpos = s-starts;
1977 s++;
1978 errmsg = "unexpected special character";
1979 goto utf7Error;
1980 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001981 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001982utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001983 outpos = p-PyUnicode_AS_UNICODE(unicode);
1984 endinpos = s-starts;
1985 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 errors, &errorHandler,
1987 "utf7", errmsg,
1988 &starts, &e, &startinpos, &endinpos, &exc, &s,
1989 &unicode, &outpos, &p))
1990 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001991 }
1992
Antoine Pitrou244651a2009-05-04 18:56:13 +00001993 /* end of string */
1994
1995 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1996 /* if we're in an inconsistent state, that's an error */
1997 if (surrogate ||
1998 (base64bits >= 6) ||
1999 (base64bits > 0 && base64buffer != 0)) {
2000 outpos = p-PyUnicode_AS_UNICODE(unicode);
2001 endinpos = size;
2002 if (unicode_decode_call_errorhandler(
2003 errors, &errorHandler,
2004 "utf7", "unterminated shift sequence",
2005 &starts, &e, &startinpos, &endinpos, &exc, &s,
2006 &unicode, &outpos, &p))
2007 goto onError;
2008 if (s < e)
2009 goto restart;
2010 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012
2013 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002014 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 if (inShift) {
2016 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 }
2019 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002024 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 goto onError;
2026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 Py_XDECREF(errorHandler);
2028 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 return (PyObject *)unicode;
2030
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 Py_XDECREF(errorHandler);
2033 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034 Py_DECREF(unicode);
2035 return NULL;
2036}
2037
2038
2039PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 int base64SetO,
2042 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002045 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002046 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002047 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002049 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002050 unsigned int base64bits = 0;
2051 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002052 char * out;
2053 char * start;
2054
2055 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002057
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002058 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002059 return PyErr_NoMemory();
2060
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 if (v == NULL)
2063 return NULL;
2064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002065 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 for (;i < size; ++i) {
2067 Py_UNICODE ch = s[i];
2068
Antoine Pitrou244651a2009-05-04 18:56:13 +00002069 if (inShift) {
2070 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2071 /* shifting out */
2072 if (base64bits) { /* output remaining bits */
2073 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2074 base64buffer = 0;
2075 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002076 }
2077 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 /* Characters not in the BASE64 set implicitly unshift the sequence
2079 so no '-' is required, except if the character is itself a '-' */
2080 if (IS_BASE64(ch) || ch == '-') {
2081 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002082 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083 *out++ = (char) ch;
2084 }
2085 else {
2086 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002087 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 else { /* not in a shift sequence */
2090 if (ch == '+') {
2091 *out++ = '+';
2092 *out++ = '-';
2093 }
2094 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2095 *out++ = (char) ch;
2096 }
2097 else {
2098 *out++ = '+';
2099 inShift = 1;
2100 goto encode_char;
2101 }
2102 }
2103 continue;
2104encode_char:
2105#ifdef Py_UNICODE_WIDE
2106 if (ch >= 0x10000) {
2107 /* code first surrogate */
2108 base64bits += 16;
2109 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2110 while (base64bits >= 6) {
2111 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2112 base64bits -= 6;
2113 }
2114 /* prepare second surrogate */
2115 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2116 }
2117#endif
2118 base64bits += 16;
2119 base64buffer = (base64buffer << 16) | ch;
2120 while (base64bits >= 6) {
2121 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2122 base64bits -= 6;
2123 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002124 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002125 if (base64bits)
2126 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2127 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002129 if (_PyBytes_Resize(&v, out - start) < 0)
2130 return NULL;
2131 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002132}
2133
Antoine Pitrou244651a2009-05-04 18:56:13 +00002134#undef IS_BASE64
2135#undef FROM_BASE64
2136#undef TO_BASE64
2137#undef DECODE_DIRECT
2138#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140/* --- UTF-8 Codec -------------------------------------------------------- */
2141
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       Each row below covers 16 consecutive byte values. */
    /* 0x00-0x7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2163
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 Py_ssize_t size,
2166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
Walter Dörwald69652032004-09-07 20:24:22 +00002168 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2169}
2170
Antoine Pitrouab868312009-01-10 15:40:25 +00002171/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2172#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2173
2174/* Mask to quickly check whether a C 'long' contains a
2175 non-ASCII, UTF8-encoded char. */
2176#if (SIZEOF_LONG == 8)
2177# define ASCII_CHAR_MASK 0x8080808080808080L
2178#elif (SIZEOF_LONG == 4)
2179# define ASCII_CHAR_MASK 0x80808080L
2180#else
2181# error C 'long' size should be either 4 or 8!
2182#endif
2183
Walter Dörwald69652032004-09-07 20:24:22 +00002184PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002185 Py_ssize_t size,
2186 const char *errors,
2187 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002188{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002191 Py_ssize_t startinpos;
2192 Py_ssize_t endinpos;
2193 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002194 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 PyUnicodeObject *unicode;
2196 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002197 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002198 PyObject *errorHandler = NULL;
2199 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200
2201 /* Note: size will always be longer than the resulting Unicode
2202 character count */
2203 unicode = _PyUnicode_New(size);
2204 if (!unicode)
2205 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002206 if (size == 0) {
2207 if (consumed)
2208 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
2212 /* Unpack UTF-8 encoded data */
2213 p = unicode->str;
2214 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002215 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002218 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219
2220 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002221 /* Fast path for runs of ASCII characters. Given that common UTF-8
2222 input will consist of an overwhelming majority of ASCII
2223 characters, we try to optimize for this case by checking
2224 as many characters as a C 'long' can contain.
2225 First, check if we can do an aligned read, as most CPUs have
2226 a penalty for unaligned reads.
2227 */
2228 if (!((size_t) s & LONG_PTR_MASK)) {
2229 /* Help register allocation */
2230 register const char *_s = s;
2231 register Py_UNICODE *_p = p;
2232 while (_s < aligned_end) {
2233 /* Read a whole long at a time (either 4 or 8 bytes),
2234 and do a fast unrolled copy if it only contains ASCII
2235 characters. */
2236 unsigned long data = *(unsigned long *) _s;
2237 if (data & ASCII_CHAR_MASK)
2238 break;
2239 _p[0] = (unsigned char) _s[0];
2240 _p[1] = (unsigned char) _s[1];
2241 _p[2] = (unsigned char) _s[2];
2242 _p[3] = (unsigned char) _s[3];
2243#if (SIZEOF_LONG == 8)
2244 _p[4] = (unsigned char) _s[4];
2245 _p[5] = (unsigned char) _s[5];
2246 _p[6] = (unsigned char) _s[6];
2247 _p[7] = (unsigned char) _s[7];
2248#endif
2249 _s += SIZEOF_LONG;
2250 _p += SIZEOF_LONG;
2251 }
2252 s = _s;
2253 p = _p;
2254 if (s == e)
2255 break;
2256 ch = (unsigned char)*s;
2257 }
2258 }
2259
2260 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 s++;
2263 continue;
2264 }
2265
2266 n = utf8_code_length[ch];
2267
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002268 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 if (consumed)
2270 break;
2271 else {
2272 errmsg = "unexpected end of data";
2273 startinpos = s-starts;
2274 endinpos = size;
2275 goto utf8Error;
2276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278
2279 switch (n) {
2280
2281 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002282 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 startinpos = s-starts;
2284 endinpos = startinpos+1;
2285 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286
2287 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002288 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 if ((s[1] & 0xc0) != 0x80) {
2295 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002296 startinpos = s-starts;
2297 endinpos = startinpos+2;
2298 goto utf8Error;
2299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002301 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
2303 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002304 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002305 goto utf8Error;
2306 }
2307 else
2308 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 break;
2310
2311 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002312 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002313 (s[2] & 0xc0) != 0x80) {
2314 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002315 startinpos = s-starts;
2316 endinpos = startinpos+3;
2317 goto utf8Error;
2318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002320 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002321 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002322 startinpos = s-starts;
2323 endinpos = startinpos+3;
2324 goto utf8Error;
2325 }
2326 else
2327 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002328 break;
2329
2330 case 4:
2331 if ((s[1] & 0xc0) != 0x80 ||
2332 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002333 (s[3] & 0xc0) != 0x80) {
2334 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002335 startinpos = s-starts;
2336 endinpos = startinpos+4;
2337 goto utf8Error;
2338 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002339 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002341 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002342 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002344 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002345 UTF-16 */
2346 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002347 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 startinpos = s-starts;
2349 endinpos = startinpos+4;
2350 goto utf8Error;
2351 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002352#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002354#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002355 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002356
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002357 /* translate from 10000..10FFFF to 0..FFFF */
2358 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002359
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002360 /* high surrogate = top 10 bits added to D800 */
2361 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002362
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002363 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002364 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002365#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 break;
2367
2368 default:
2369 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002370 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002371 startinpos = s-starts;
2372 endinpos = startinpos+n;
2373 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 }
2375 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Benjamin Peterson29060642009-01-31 22:14:21 +00002378 utf8Error:
2379 outpos = p-PyUnicode_AS_UNICODE(unicode);
2380 if (unicode_decode_call_errorhandler(
2381 errors, &errorHandler,
2382 "utf8", errmsg,
2383 &starts, &e, &startinpos, &endinpos, &exc, &s,
2384 &unicode, &outpos, &p))
2385 goto onError;
2386 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 }
Walter Dörwald69652032004-09-07 20:24:22 +00002388 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002392 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 goto onError;
2394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002395 Py_XDECREF(errorHandler);
2396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 return (PyObject *)unicode;
2398
Benjamin Peterson29060642009-01-31 22:14:21 +00002399 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 Py_XDECREF(errorHandler);
2401 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 Py_DECREF(unicode);
2403 return NULL;
2404}
2405
Antoine Pitrouab868312009-01-10 15:40:25 +00002406#undef ASCII_CHAR_MASK
2407
2408
Tim Peters602f7402002-04-27 18:03:26 +00002409/* Allocation strategy: if the string is short, convert into a stack buffer
2410 and allocate exactly as much space needed at the end. Else allocate the
2411 maximum possible needed (4 result bytes per Unicode character), and return
2412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002414PyObject *
2415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 Py_ssize_t size,
2417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418{
Tim Peters602f7402002-04-27 18:03:26 +00002419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002420
Guido van Rossum98297ee2007-11-06 21:34:58 +00002421 Py_ssize_t i; /* index into s of next input byte */
2422 PyObject *result; /* result string object */
2423 char *p; /* next free byte in output buffer */
2424 Py_ssize_t nallocated; /* number of result bytes allocated */
2425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002427 PyObject *errorHandler = NULL;
2428 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002429
Tim Peters602f7402002-04-27 18:03:26 +00002430 assert(s != NULL);
2431 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432
Tim Peters602f7402002-04-27 18:03:26 +00002433 if (size <= MAX_SHORT_UNICHARS) {
2434 /* Write into the stack buffer; nallocated can't overflow.
2435 * At the end, we'll allocate exactly as much heap space as it
2436 * turns out we need.
2437 */
2438 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002439 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002440 p = stackbuf;
2441 }
2442 else {
2443 /* Overallocate on the heap, and give the excess back at the end. */
2444 nallocated = size * 4;
2445 if (nallocated / 4 != size) /* overflow! */
2446 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002447 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002448 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002449 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002450 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002451 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002452
Tim Peters602f7402002-04-27 18:03:26 +00002453 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002454 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002455
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002456 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002457 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002459
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002461 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002462 *p++ = (char)(0xc0 | (ch >> 6));
2463 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002464 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002465 else {
Tim Peters602f7402002-04-27 18:03:26 +00002466 /* Encode UCS2 Unicode ordinals */
2467 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002468#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002469 /* Special case: check for high surrogate */
2470 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2471 Py_UCS4 ch2 = s[i];
2472 /* Check for low surrogate and combine the two to
2473 form a UCS4 value */
2474 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002475 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002476 i++;
2477 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002478 }
Tim Peters602f7402002-04-27 18:03:26 +00002479 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002480 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002481#endif
2482 if (ch >= 0xd800 && ch <= 0xdfff) {
2483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 char *prep;
2486 int k;
2487 rep = unicode_encode_call_errorhandler
2488 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2489 s, size, &exc, i-1, i, &newpos);
2490 if (!rep)
2491 goto error;
2492 /* Implementation limitations: only support error handler that return
2493 bytes, and only support up to four replacement bytes. */
2494 if (!PyBytes_Check(rep)) {
2495 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2496 Py_DECREF(rep);
2497 goto error;
2498 }
2499 if (PyBytes_Size(rep) > 4) {
2500 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2501 Py_DECREF(rep);
2502 goto error;
2503 }
2504 prep = PyBytes_AsString(rep);
2505 for(k = PyBytes_Size(rep); k > 0; k--)
2506 *p++ = *prep++;
2507 Py_DECREF(rep);
2508 continue;
2509
2510 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002511 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002512 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2513 *p++ = (char)(0x80 | (ch & 0x3f));
2514 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002516 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002517 /* Encode UCS4 Unicode ordinals */
2518 *p++ = (char)(0xf0 | (ch >> 18));
2519 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2520 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2521 *p++ = (char)(0x80 | (ch & 0x3f));
2522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002524
Guido van Rossum98297ee2007-11-06 21:34:58 +00002525 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002526 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002527 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002528 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002529 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002530 }
2531 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002532 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002533 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002534 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002535 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002536 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002537 Py_XDECREF(errorHandler);
2538 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002539 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002540 error:
2541 Py_XDECREF(errorHandler);
2542 Py_XDECREF(exc);
2543 Py_XDECREF(result);
2544 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002545
Tim Peters602f7402002-04-27 18:03:26 +00002546#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547}
2548
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 if (!PyUnicode_Check(unicode)) {
2552 PyErr_BadArgument();
2553 return NULL;
2554 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002555 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 PyUnicode_GET_SIZE(unicode),
2557 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558}
2559
Walter Dörwald41980ca2007-08-16 21:55:45 +00002560/* --- UTF-32 Codec ------------------------------------------------------- */
2561
2562PyObject *
2563PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002564 Py_ssize_t size,
2565 const char *errors,
2566 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002567{
2568 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2569}
2570
2571PyObject *
2572PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 Py_ssize_t size,
2574 const char *errors,
2575 int *byteorder,
2576 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002577{
2578 const char *starts = s;
2579 Py_ssize_t startinpos;
2580 Py_ssize_t endinpos;
2581 Py_ssize_t outpos;
2582 PyUnicodeObject *unicode;
2583 Py_UNICODE *p;
2584#ifndef Py_UNICODE_WIDE
2585 int i, pairs;
2586#else
2587 const int pairs = 0;
2588#endif
2589 const unsigned char *q, *e;
2590 int bo = 0; /* assume native ordering by default */
2591 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002592 /* Offsets from q for retrieving bytes in the right order. */
2593#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2594 int iorder[] = {0, 1, 2, 3};
2595#else
2596 int iorder[] = {3, 2, 1, 0};
2597#endif
2598 PyObject *errorHandler = NULL;
2599 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002600 /* On narrow builds we split characters outside the BMP into two
2601 codepoints => count how much extra space we need. */
2602#ifndef Py_UNICODE_WIDE
2603 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 if (((Py_UCS4 *)s)[i] >= 0x10000)
2605 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002606#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002607
2608 /* This might be one to much, because of a BOM */
2609 unicode = _PyUnicode_New((size+3)/4+pairs);
2610 if (!unicode)
2611 return NULL;
2612 if (size == 0)
2613 return (PyObject *)unicode;
2614
2615 /* Unpack UTF-32 encoded data */
2616 p = unicode->str;
2617 q = (unsigned char *)s;
2618 e = q + size;
2619
2620 if (byteorder)
2621 bo = *byteorder;
2622
2623 /* Check for BOM marks (U+FEFF) in the input and adjust current
2624 byte order setting accordingly. In native mode, the leading BOM
2625 mark is skipped, in all other modes, it is copied to the output
2626 stream as-is (giving a ZWNBSP character). */
2627 if (bo == 0) {
2628 if (size >= 4) {
2629 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002631#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 if (bom == 0x0000FEFF) {
2633 q += 4;
2634 bo = -1;
2635 }
2636 else if (bom == 0xFFFE0000) {
2637 q += 4;
2638 bo = 1;
2639 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 if (bom == 0x0000FEFF) {
2642 q += 4;
2643 bo = 1;
2644 }
2645 else if (bom == 0xFFFE0000) {
2646 q += 4;
2647 bo = -1;
2648 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002649#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002651 }
2652
2653 if (bo == -1) {
2654 /* force LE */
2655 iorder[0] = 0;
2656 iorder[1] = 1;
2657 iorder[2] = 2;
2658 iorder[3] = 3;
2659 }
2660 else if (bo == 1) {
2661 /* force BE */
2662 iorder[0] = 3;
2663 iorder[1] = 2;
2664 iorder[2] = 1;
2665 iorder[3] = 0;
2666 }
2667
2668 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 Py_UCS4 ch;
2670 /* remaining bytes at the end? (size should be divisible by 4) */
2671 if (e-q<4) {
2672 if (consumed)
2673 break;
2674 errmsg = "truncated data";
2675 startinpos = ((const char *)q)-starts;
2676 endinpos = ((const char *)e)-starts;
2677 goto utf32Error;
2678 /* The remaining input chars are ignored if the callback
2679 chooses to skip the input */
2680 }
2681 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2682 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002683
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 if (ch >= 0x110000)
2685 {
2686 errmsg = "codepoint not in range(0x110000)";
2687 startinpos = ((const char *)q)-starts;
2688 endinpos = startinpos+4;
2689 goto utf32Error;
2690 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002691#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 if (ch >= 0x10000)
2693 {
2694 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2695 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2696 }
2697 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002698#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 *p++ = ch;
2700 q += 4;
2701 continue;
2702 utf32Error:
2703 outpos = p-PyUnicode_AS_UNICODE(unicode);
2704 if (unicode_decode_call_errorhandler(
2705 errors, &errorHandler,
2706 "utf32", errmsg,
2707 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2708 &unicode, &outpos, &p))
2709 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002710 }
2711
2712 if (byteorder)
2713 *byteorder = bo;
2714
2715 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002716 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002717
2718 /* Adjust length */
2719 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2720 goto onError;
2721
2722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
2724 return (PyObject *)unicode;
2725
Benjamin Peterson29060642009-01-31 22:14:21 +00002726 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002727 Py_DECREF(unicode);
2728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
2730 return NULL;
2731}
2732
2733PyObject *
2734PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002735 Py_ssize_t size,
2736 const char *errors,
2737 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002738{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002739 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002741 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002743 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002744#else
2745 const int pairs = 0;
2746#endif
2747 /* Offsets from p for storing byte pairs in the right order. */
2748#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2749 int iorder[] = {0, 1, 2, 3};
2750#else
2751 int iorder[] = {3, 2, 1, 0};
2752#endif
2753
Benjamin Peterson29060642009-01-31 22:14:21 +00002754#define STORECHAR(CH) \
2755 do { \
2756 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2757 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2758 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2759 p[iorder[0]] = (CH) & 0xff; \
2760 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002761 } while(0)
2762
2763 /* In narrow builds we can output surrogate pairs as one codepoint,
2764 so we need less space. */
2765#ifndef Py_UNICODE_WIDE
2766 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2768 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2769 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002770#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002771 nsize = (size - pairs + (byteorder == 0));
2772 bytesize = nsize * 4;
2773 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002774 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002775 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776 if (v == NULL)
2777 return NULL;
2778
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002779 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002781 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002783 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784
2785 if (byteorder == -1) {
2786 /* force LE */
2787 iorder[0] = 0;
2788 iorder[1] = 1;
2789 iorder[2] = 2;
2790 iorder[3] = 3;
2791 }
2792 else if (byteorder == 1) {
2793 /* force BE */
2794 iorder[0] = 3;
2795 iorder[1] = 2;
2796 iorder[2] = 1;
2797 iorder[3] = 0;
2798 }
2799
2800 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002801 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002802#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2804 Py_UCS4 ch2 = *s;
2805 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2806 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2807 s++;
2808 size--;
2809 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002811#endif
2812 STORECHAR(ch);
2813 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002814
2815 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002816 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002817#undef STORECHAR
2818}
2819
2820PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2821{
2822 if (!PyUnicode_Check(unicode)) {
2823 PyErr_BadArgument();
2824 return NULL;
2825 }
2826 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 PyUnicode_GET_SIZE(unicode),
2828 NULL,
2829 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002830}
2831
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832/* --- UTF-16 Codec ------------------------------------------------------- */
2833
Tim Peters772747b2001-08-09 22:21:55 +00002834PyObject *
2835PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 Py_ssize_t size,
2837 const char *errors,
2838 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839{
Walter Dörwald69652032004-09-07 20:24:22 +00002840 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2841}
2842
Antoine Pitrouab868312009-01-10 15:40:25 +00002843/* Two masks for fast checking of whether a C 'long' may contain
2844 UTF16-encoded surrogate characters. This is an efficient heuristic,
2845 assuming that non-surrogate characters with a code point >= 0x8000 are
2846 rare in most input.
2847 FAST_CHAR_MASK is used when the input is in native byte ordering,
2848 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002849*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002850#if (SIZEOF_LONG == 8)
2851# define FAST_CHAR_MASK 0x8000800080008000L
2852# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2853#elif (SIZEOF_LONG == 4)
2854# define FAST_CHAR_MASK 0x80008000L
2855# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2856#else
2857# error C 'long' size should be either 4 or 8!
2858#endif
2859
Walter Dörwald69652032004-09-07 20:24:22 +00002860PyObject *
2861PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 Py_ssize_t size,
2863 const char *errors,
2864 int *byteorder,
2865 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002866{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t startinpos;
2869 Py_ssize_t endinpos;
2870 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 PyUnicodeObject *unicode;
2872 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002873 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002874 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002875 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002876 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002877 /* Offsets from q for retrieving byte pairs in the right order. */
2878#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2879 int ihi = 1, ilo = 0;
2880#else
2881 int ihi = 0, ilo = 1;
2882#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002883 PyObject *errorHandler = NULL;
2884 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885
2886 /* Note: size will always be longer than the resulting Unicode
2887 character count */
2888 unicode = _PyUnicode_New(size);
2889 if (!unicode)
2890 return NULL;
2891 if (size == 0)
2892 return (PyObject *)unicode;
2893
2894 /* Unpack UTF-16 encoded data */
2895 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002896 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002897 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898
2899 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002900 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002902 /* Check for BOM marks (U+FEFF) in the input and adjust current
2903 byte order setting accordingly. In native mode, the leading BOM
2904 mark is skipped, in all other modes, it is copied to the output
2905 stream as-is (giving a ZWNBSP character). */
2906 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002907 if (size >= 2) {
2908 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002909#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 if (bom == 0xFEFF) {
2911 q += 2;
2912 bo = -1;
2913 }
2914 else if (bom == 0xFFFE) {
2915 q += 2;
2916 bo = 1;
2917 }
Tim Petersced69f82003-09-16 20:30:58 +00002918#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 if (bom == 0xFEFF) {
2920 q += 2;
2921 bo = 1;
2922 }
2923 else if (bom == 0xFFFE) {
2924 q += 2;
2925 bo = -1;
2926 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002927#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
Tim Peters772747b2001-08-09 22:21:55 +00002931 if (bo == -1) {
2932 /* force LE */
2933 ihi = 1;
2934 ilo = 0;
2935 }
2936 else if (bo == 1) {
2937 /* force BE */
2938 ihi = 0;
2939 ilo = 1;
2940 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002941#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2942 native_ordering = ilo < ihi;
2943#else
2944 native_ordering = ilo > ihi;
2945#endif
Tim Peters772747b2001-08-09 22:21:55 +00002946
Antoine Pitrouab868312009-01-10 15:40:25 +00002947 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002948 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002950 /* First check for possible aligned read of a C 'long'. Unaligned
2951 reads are more expensive, better to defer to another iteration. */
2952 if (!((size_t) q & LONG_PTR_MASK)) {
2953 /* Fast path for runs of non-surrogate chars. */
2954 register const unsigned char *_q = q;
2955 Py_UNICODE *_p = p;
2956 if (native_ordering) {
2957 /* Native ordering is simple: as long as the input cannot
2958 possibly contain a surrogate char, do an unrolled copy
2959 of several 16-bit code points to the target object.
2960 The non-surrogate check is done on several input bytes
2961 at a time (as many as a C 'long' can contain). */
2962 while (_q < aligned_end) {
2963 unsigned long data = * (unsigned long *) _q;
2964 if (data & FAST_CHAR_MASK)
2965 break;
2966 _p[0] = ((unsigned short *) _q)[0];
2967 _p[1] = ((unsigned short *) _q)[1];
2968#if (SIZEOF_LONG == 8)
2969 _p[2] = ((unsigned short *) _q)[2];
2970 _p[3] = ((unsigned short *) _q)[3];
2971#endif
2972 _q += SIZEOF_LONG;
2973 _p += SIZEOF_LONG / 2;
2974 }
2975 }
2976 else {
2977 /* Byteswapped ordering is similar, but we must decompose
2978 the copy bytewise, and take care of zero'ing out the
2979 upper bytes if the target object is in 32-bit units
2980 (that is, in UCS-4 builds). */
2981 while (_q < aligned_end) {
2982 unsigned long data = * (unsigned long *) _q;
2983 if (data & SWAPPED_FAST_CHAR_MASK)
2984 break;
2985 /* Zero upper bytes in UCS-4 builds */
2986#if (Py_UNICODE_SIZE > 2)
2987 _p[0] = 0;
2988 _p[1] = 0;
2989#if (SIZEOF_LONG == 8)
2990 _p[2] = 0;
2991 _p[3] = 0;
2992#endif
2993#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002994 /* Issue #4916; UCS-4 builds on big endian machines must
2995 fill the two last bytes of each 4-byte unit. */
2996#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2997# define OFF 2
2998#else
2999# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003000#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003001 ((unsigned char *) _p)[OFF + 1] = _q[0];
3002 ((unsigned char *) _p)[OFF + 0] = _q[1];
3003 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3004 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3005#if (SIZEOF_LONG == 8)
3006 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3007 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3008 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3009 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3010#endif
3011#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003012 _q += SIZEOF_LONG;
3013 _p += SIZEOF_LONG / 2;
3014 }
3015 }
3016 p = _p;
3017 q = _q;
3018 if (q >= e)
3019 break;
3020 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022
Benjamin Peterson14339b62009-01-31 16:36:08 +00003023 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003024
3025 if (ch < 0xD800 || ch > 0xDFFF) {
3026 *p++ = ch;
3027 continue;
3028 }
3029
3030 /* UTF-16 code pair: */
3031 if (q > e) {
3032 errmsg = "unexpected end of data";
3033 startinpos = (((const char *)q) - 2) - starts;
3034 endinpos = ((const char *)e) + 1 - starts;
3035 goto utf16Error;
3036 }
3037 if (0xD800 <= ch && ch <= 0xDBFF) {
3038 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3039 q += 2;
3040 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003041#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 *p++ = ch;
3043 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003044#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 continue;
3048 }
3049 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003050 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 startinpos = (((const char *)q)-4)-starts;
3052 endinpos = startinpos+2;
3053 goto utf16Error;
3054 }
3055
Benjamin Peterson14339b62009-01-31 16:36:08 +00003056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 errmsg = "illegal encoding";
3058 startinpos = (((const char *)q)-2)-starts;
3059 endinpos = startinpos+2;
3060 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003061
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 utf16Error:
3063 outpos = p - PyUnicode_AS_UNICODE(unicode);
3064 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003065 errors,
3066 &errorHandler,
3067 "utf16", errmsg,
3068 &starts,
3069 (const char **)&e,
3070 &startinpos,
3071 &endinpos,
3072 &exc,
3073 (const char **)&q,
3074 &unicode,
3075 &outpos,
3076 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003079 /* remaining byte at the end? (size should be even) */
3080 if (e == q) {
3081 if (!consumed) {
3082 errmsg = "truncated data";
3083 startinpos = ((const char *)q) - starts;
3084 endinpos = ((const char *)e) + 1 - starts;
3085 outpos = p - PyUnicode_AS_UNICODE(unicode);
3086 if (unicode_decode_call_errorhandler(
3087 errors,
3088 &errorHandler,
3089 "utf16", errmsg,
3090 &starts,
3091 (const char **)&e,
3092 &startinpos,
3093 &endinpos,
3094 &exc,
3095 (const char **)&q,
3096 &unicode,
3097 &outpos,
3098 &p))
3099 goto onError;
3100 /* The remaining input chars are ignored if the callback
3101 chooses to skip the input */
3102 }
3103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
3105 if (byteorder)
3106 *byteorder = bo;
3107
Walter Dörwald69652032004-09-07 20:24:22 +00003108 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003110
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003112 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 goto onError;
3114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 Py_XDECREF(errorHandler);
3116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117 return (PyObject *)unicode;
3118
Benjamin Peterson29060642009-01-31 22:14:21 +00003119 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003121 Py_XDECREF(errorHandler);
3122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 return NULL;
3124}
3125
Antoine Pitrouab868312009-01-10 15:40:25 +00003126#undef FAST_CHAR_MASK
3127#undef SWAPPED_FAST_CHAR_MASK
3128
Tim Peters772747b2001-08-09 22:21:55 +00003129PyObject *
3130PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 Py_ssize_t size,
3132 const char *errors,
3133 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003135 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003136 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003137 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003138#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003139 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003140#else
3141 const int pairs = 0;
3142#endif
Tim Peters772747b2001-08-09 22:21:55 +00003143 /* Offsets from p for storing byte pairs in the right order. */
3144#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3145 int ihi = 1, ilo = 0;
3146#else
3147 int ihi = 0, ilo = 1;
3148#endif
3149
Benjamin Peterson29060642009-01-31 22:14:21 +00003150#define STORECHAR(CH) \
3151 do { \
3152 p[ihi] = ((CH) >> 8) & 0xff; \
3153 p[ilo] = (CH) & 0xff; \
3154 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003155 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003157#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003158 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 if (s[i] >= 0x10000)
3160 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003161#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003162 /* 2 * (size + pairs + (byteorder == 0)) */
3163 if (size > PY_SSIZE_T_MAX ||
3164 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003166 nsize = size + pairs + (byteorder == 0);
3167 bytesize = nsize * 2;
3168 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003170 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 if (v == NULL)
3172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003174 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003177 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003178 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003179
3180 if (byteorder == -1) {
3181 /* force LE */
3182 ihi = 1;
3183 ilo = 0;
3184 }
3185 else if (byteorder == 1) {
3186 /* force BE */
3187 ihi = 0;
3188 ilo = 1;
3189 }
3190
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003191 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 Py_UNICODE ch = *s++;
3193 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003194#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 if (ch >= 0x10000) {
3196 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3197 ch = 0xD800 | ((ch-0x10000) >> 10);
3198 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003199#endif
Tim Peters772747b2001-08-09 22:21:55 +00003200 STORECHAR(ch);
3201 if (ch2)
3202 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003203 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003204
3205 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003206 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003207#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208}
3209
3210PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3211{
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 return NULL;
3215 }
3216 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 PyUnicode_GET_SIZE(unicode),
3218 NULL,
3219 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220}
3221
3222/* --- Unicode Escape Codec ----------------------------------------------- */
3223
Fredrik Lundh06d12682001-01-24 07:59:11 +00003224static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003225
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 Py_ssize_t size,
3228 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003231 Py_ssize_t startinpos;
3232 Py_ssize_t endinpos;
3233 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003238 char* message;
3239 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 PyObject *errorHandler = NULL;
3241 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 /* Escaped strings will always be longer than the resulting
3244 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 length after conversion to the true value.
3246 (but if the error callback returns a long replacement string
3247 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 v = _PyUnicode_New(size);
3249 if (v == NULL)
3250 goto onError;
3251 if (size == 0)
3252 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003256
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 while (s < end) {
3258 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003259 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
3262 /* Non-escape characters are interpreted as Unicode ordinals */
3263 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003264 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 continue;
3266 }
3267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 /* \ - Escapes */
3270 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003271 c = *s++;
3272 if (s > end)
3273 c = '\0'; /* Invalid after \ */
3274 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 case '\n': break;
3278 case '\\': *p++ = '\\'; break;
3279 case '\'': *p++ = '\''; break;
3280 case '\"': *p++ = '\"'; break;
3281 case 'b': *p++ = '\b'; break;
3282 case 'f': *p++ = '\014'; break; /* FF */
3283 case 't': *p++ = '\t'; break;
3284 case 'n': *p++ = '\n'; break;
3285 case 'r': *p++ = '\r'; break;
3286 case 'v': *p++ = '\013'; break; /* VT */
3287 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3288
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 case '0': case '1': case '2': case '3':
3291 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003292 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003293 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003294 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003295 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003296 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003298 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 break;
3300
Benjamin Peterson29060642009-01-31 22:14:21 +00003301 /* hex escapes */
3302 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003304 digits = 2;
3305 message = "truncated \\xXX escape";
3306 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003310 digits = 4;
3311 message = "truncated \\uXXXX escape";
3312 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003315 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003316 digits = 8;
3317 message = "truncated \\UXXXXXXXX escape";
3318 hexescape:
3319 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 outpos = p-PyUnicode_AS_UNICODE(v);
3321 if (s+digits>end) {
3322 endinpos = size;
3323 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 errors, &errorHandler,
3325 "unicodeescape", "end of string in escape sequence",
3326 &starts, &end, &startinpos, &endinpos, &exc, &s,
3327 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 goto onError;
3329 goto nextByte;
3330 }
3331 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003332 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003333 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334 endinpos = (s+i+1)-starts;
3335 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 errors, &errorHandler,
3337 "unicodeescape", message,
3338 &starts, &end, &startinpos, &endinpos, &exc, &s,
3339 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003340 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003342 }
3343 chr = (chr<<4) & ~0xF;
3344 if (c >= '0' && c <= '9')
3345 chr += c - '0';
3346 else if (c >= 'a' && c <= 'f')
3347 chr += 10 + c - 'a';
3348 else
3349 chr += 10 + c - 'A';
3350 }
3351 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003352 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 /* _decoding_error will have already written into the
3354 target buffer. */
3355 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003356 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003357 /* when we get here, chr is a 32-bit unicode character */
3358 if (chr <= 0xffff)
3359 /* UCS-2 character */
3360 *p++ = (Py_UNICODE) chr;
3361 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003362 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003363 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003364#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003365 *p++ = chr;
3366#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003367 chr -= 0x10000L;
3368 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003369 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003370#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003371 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 endinpos = s-starts;
3373 outpos = p-PyUnicode_AS_UNICODE(v);
3374 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 errors, &errorHandler,
3376 "unicodeescape", "illegal Unicode character",
3377 &starts, &end, &startinpos, &endinpos, &exc, &s,
3378 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003379 goto onError;
3380 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003381 break;
3382
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003384 case 'N':
3385 message = "malformed \\N character escape";
3386 if (ucnhash_CAPI == NULL) {
3387 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003388 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003389 if (ucnhash_CAPI == NULL)
3390 goto ucnhashError;
3391 }
3392 if (*s == '{') {
3393 const char *start = s+1;
3394 /* look for the closing brace */
3395 while (*s != '}' && s < end)
3396 s++;
3397 if (s > start && s < end && *s == '}') {
3398 /* found a name. look it up in the unicode database */
3399 message = "unknown Unicode character name";
3400 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003401 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003402 goto store;
3403 }
3404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 endinpos = s-starts;
3406 outpos = p-PyUnicode_AS_UNICODE(v);
3407 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 errors, &errorHandler,
3409 "unicodeescape", message,
3410 &starts, &end, &startinpos, &endinpos, &exc, &s,
3411 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003412 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003413 break;
3414
3415 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003416 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 message = "\\ at end of string";
3418 s--;
3419 endinpos = s-starts;
3420 outpos = p-PyUnicode_AS_UNICODE(v);
3421 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 errors, &errorHandler,
3423 "unicodeescape", message,
3424 &starts, &end, &startinpos, &endinpos, &exc, &s,
3425 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003426 goto onError;
3427 }
3428 else {
3429 *p++ = '\\';
3430 *p++ = (unsigned char)s[-1];
3431 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003432 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003437 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003439 Py_XDECREF(errorHandler);
3440 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003444 PyErr_SetString(
3445 PyExc_UnicodeError,
3446 "\\N escapes not supported (can't load unicodedata module)"
3447 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003448 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 Py_XDECREF(errorHandler);
3450 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003451 return NULL;
3452
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 Py_XDECREF(errorHandler);
3456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 return NULL;
3458}
3459
3460/* Return a Unicode-Escape string version of the Unicode object.
3461
3462 If quotes is true, the string is enclosed in u"" or u'' quotes as
3463 appropriate.
3464
3465*/
3466
Thomas Wouters477c8d52006-05-27 19:21:47 +00003467Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 Py_ssize_t size,
3469 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003470{
3471 /* like wcschr, but doesn't stop at NULL characters */
3472
3473 while (size-- > 0) {
3474 if (*s == ch)
3475 return s;
3476 s++;
3477 }
3478
3479 return NULL;
3480}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003481
Walter Dörwald79e913e2007-05-12 11:08:06 +00003482static const char *hexdigits = "0123456789abcdef";
3483
3484PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003487 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003490#ifdef Py_UNICODE_WIDE
3491 const Py_ssize_t expandsize = 10;
3492#else
3493 const Py_ssize_t expandsize = 6;
3494#endif
3495
Thomas Wouters89f507f2006-12-13 04:49:30 +00003496 /* XXX(nnorwitz): rather than over-allocating, it would be
3497 better to choose a different scheme. Perhaps scan the
3498 first N-chars of the string and allocate based on that size.
3499 */
3500 /* Initial allocation is based on the longest-possible unichr
3501 escape.
3502
3503 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3504 unichr, so in this case it's the longest unichr escape. In
3505 narrow (UTF-16) builds this is five chars per source unichr
3506 since there are two unichrs in the surrogate pair, so in narrow
3507 (UTF-16) builds it's not the longest unichr escape.
3508
3509 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3510 so in the narrow (UTF-16) build case it's the longest unichr
3511 escape.
3512 */
3513
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003514 if (size == 0)
3515 return PyBytes_FromStringAndSize(NULL, 0);
3516
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003517 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003519
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003520 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 2
3522 + expandsize*size
3523 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 if (repr == NULL)
3525 return NULL;
3526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003527 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 while (size-- > 0) {
3530 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003531
Walter Dörwald79e913e2007-05-12 11:08:06 +00003532 /* Escape backslashes */
3533 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 *p++ = '\\';
3535 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003536 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003537 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003538
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003539#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003540 /* Map 21-bit characters to '\U00xxxxxx' */
3541 else if (ch >= 0x10000) {
3542 *p++ = '\\';
3543 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003544 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3545 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3546 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3547 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3548 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3549 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3550 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3551 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003553 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003554#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3556 else if (ch >= 0xD800 && ch < 0xDC00) {
3557 Py_UNICODE ch2;
3558 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003559
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 ch2 = *s++;
3561 size--;
3562 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3563 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3564 *p++ = '\\';
3565 *p++ = 'U';
3566 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3567 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3568 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3569 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3570 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3571 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3572 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3573 *p++ = hexdigits[ucs & 0x0000000F];
3574 continue;
3575 }
3576 /* Fall through: isolated surrogates are copied as-is */
3577 s--;
3578 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003579 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003580#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003581
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003583 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 *p++ = '\\';
3585 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003586 *p++ = hexdigits[(ch >> 12) & 0x000F];
3587 *p++ = hexdigits[(ch >> 8) & 0x000F];
3588 *p++ = hexdigits[(ch >> 4) & 0x000F];
3589 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003591
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003592 /* Map special whitespace to '\t', \n', '\r' */
3593 else if (ch == '\t') {
3594 *p++ = '\\';
3595 *p++ = 't';
3596 }
3597 else if (ch == '\n') {
3598 *p++ = '\\';
3599 *p++ = 'n';
3600 }
3601 else if (ch == '\r') {
3602 *p++ = '\\';
3603 *p++ = 'r';
3604 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003605
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003606 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003607 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003609 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003610 *p++ = hexdigits[(ch >> 4) & 0x000F];
3611 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003612 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003613
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 /* Copy everything else as-is */
3615 else
3616 *p++ = (char) ch;
3617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003619 assert(p - PyBytes_AS_STRING(repr) > 0);
3620 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3621 return NULL;
3622 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623}
3624
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003625PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003627 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 if (!PyUnicode_Check(unicode)) {
3629 PyErr_BadArgument();
3630 return NULL;
3631 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003632 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3633 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003634 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635}
3636
3637/* --- Raw Unicode Escape Codec ------------------------------------------- */
3638
3639PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 Py_ssize_t size,
3641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003644 Py_ssize_t startinpos;
3645 Py_ssize_t endinpos;
3646 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 const char *end;
3650 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 PyObject *errorHandler = NULL;
3652 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003653
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 /* Escaped strings will always be longer than the resulting
3655 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 length after conversion to the true value. (But decoding error
3657 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 v = _PyUnicode_New(size);
3659 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 end = s + size;
3665 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 unsigned char c;
3667 Py_UCS4 x;
3668 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003669 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 /* Non-escape characters are interpreted as Unicode ordinals */
3672 if (*s != '\\') {
3673 *p++ = (unsigned char)*s++;
3674 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 startinpos = s-starts;
3677
3678 /* \u-escapes are only interpreted iff the number of leading
3679 backslashes if odd */
3680 bs = s;
3681 for (;s < end;) {
3682 if (*s != '\\')
3683 break;
3684 *p++ = (unsigned char)*s++;
3685 }
3686 if (((s - bs) & 1) == 0 ||
3687 s >= end ||
3688 (*s != 'u' && *s != 'U')) {
3689 continue;
3690 }
3691 p--;
3692 count = *s=='u' ? 4 : 8;
3693 s++;
3694
3695 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3696 outpos = p-PyUnicode_AS_UNICODE(v);
3697 for (x = 0, i = 0; i < count; ++i, ++s) {
3698 c = (unsigned char)*s;
3699 if (!ISXDIGIT(c)) {
3700 endinpos = s-starts;
3701 if (unicode_decode_call_errorhandler(
3702 errors, &errorHandler,
3703 "rawunicodeescape", "truncated \\uXXXX",
3704 &starts, &end, &startinpos, &endinpos, &exc, &s,
3705 &v, &outpos, &p))
3706 goto onError;
3707 goto nextByte;
3708 }
3709 x = (x<<4) & ~0xF;
3710 if (c >= '0' && c <= '9')
3711 x += c - '0';
3712 else if (c >= 'a' && c <= 'f')
3713 x += 10 + c - 'a';
3714 else
3715 x += 10 + c - 'A';
3716 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003717 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 /* UCS-2 character */
3719 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003720 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 /* UCS-4 character. Either store directly, or as
3722 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003723#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003725#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 x -= 0x10000L;
3727 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3728 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003729#endif
3730 } else {
3731 endinpos = s-starts;
3732 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003733 if (unicode_decode_call_errorhandler(
3734 errors, &errorHandler,
3735 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 &starts, &end, &startinpos, &endinpos, &exc, &s,
3737 &v, &outpos, &p))
3738 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 nextByte:
3741 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003743 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003744 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 Py_XDECREF(errorHandler);
3746 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003748
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 Py_XDECREF(errorHandler);
3752 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 return NULL;
3754}
3755
3756PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003759 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 char *p;
3761 char *q;
3762
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003763#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003764 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003765#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003766 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003767#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003768
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003769 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003771
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003772 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 if (repr == NULL)
3774 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003775 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003776 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003778 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 while (size-- > 0) {
3780 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003781#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 /* Map 32-bit characters to '\Uxxxxxxxx' */
3783 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003784 *p++ = '\\';
3785 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003786 *p++ = hexdigits[(ch >> 28) & 0xf];
3787 *p++ = hexdigits[(ch >> 24) & 0xf];
3788 *p++ = hexdigits[(ch >> 20) & 0xf];
3789 *p++ = hexdigits[(ch >> 16) & 0xf];
3790 *p++ = hexdigits[(ch >> 12) & 0xf];
3791 *p++ = hexdigits[(ch >> 8) & 0xf];
3792 *p++ = hexdigits[(ch >> 4) & 0xf];
3793 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003794 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003795 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003796#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003797 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3798 if (ch >= 0xD800 && ch < 0xDC00) {
3799 Py_UNICODE ch2;
3800 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003801
Benjamin Peterson29060642009-01-31 22:14:21 +00003802 ch2 = *s++;
3803 size--;
3804 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3805 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3806 *p++ = '\\';
3807 *p++ = 'U';
3808 *p++ = hexdigits[(ucs >> 28) & 0xf];
3809 *p++ = hexdigits[(ucs >> 24) & 0xf];
3810 *p++ = hexdigits[(ucs >> 20) & 0xf];
3811 *p++ = hexdigits[(ucs >> 16) & 0xf];
3812 *p++ = hexdigits[(ucs >> 12) & 0xf];
3813 *p++ = hexdigits[(ucs >> 8) & 0xf];
3814 *p++ = hexdigits[(ucs >> 4) & 0xf];
3815 *p++ = hexdigits[ucs & 0xf];
3816 continue;
3817 }
3818 /* Fall through: isolated surrogates are copied as-is */
3819 s--;
3820 size++;
3821 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003822#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 /* Map 16-bit characters to '\uxxxx' */
3824 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 *p++ = '\\';
3826 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003827 *p++ = hexdigits[(ch >> 12) & 0xf];
3828 *p++ = hexdigits[(ch >> 8) & 0xf];
3829 *p++ = hexdigits[(ch >> 4) & 0xf];
3830 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 /* Copy everything else as-is */
3833 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 *p++ = (char) ch;
3835 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003836 size = p - q;
3837
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003838 assert(size > 0);
3839 if (_PyBytes_Resize(&repr, size) < 0)
3840 return NULL;
3841 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842}
3843
3844PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3845{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003846 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003848 PyErr_BadArgument();
3849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003851 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3852 PyUnicode_GET_SIZE(unicode));
3853
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003854 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855}
3856
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003857/* --- Unicode Internal Codec ------------------------------------------- */
3858
3859PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 Py_ssize_t size,
3861 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003862{
3863 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003864 Py_ssize_t startinpos;
3865 Py_ssize_t endinpos;
3866 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003867 PyUnicodeObject *v;
3868 Py_UNICODE *p;
3869 const char *end;
3870 const char *reason;
3871 PyObject *errorHandler = NULL;
3872 PyObject *exc = NULL;
3873
Neal Norwitzd43069c2006-01-08 01:12:10 +00003874#ifdef Py_UNICODE_WIDE
3875 Py_UNICODE unimax = PyUnicode_GetMax();
3876#endif
3877
Thomas Wouters89f507f2006-12-13 04:49:30 +00003878 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003879 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3880 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003882 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003884 p = PyUnicode_AS_UNICODE(v);
3885 end = s + size;
3886
3887 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003888 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003889 /* We have to sanity check the raw data, otherwise doom looms for
3890 some malformed UCS-4 data. */
3891 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003892#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003893 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003894#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003895 end-s < Py_UNICODE_SIZE
3896 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003898 startinpos = s - starts;
3899 if (end-s < Py_UNICODE_SIZE) {
3900 endinpos = end-starts;
3901 reason = "truncated input";
3902 }
3903 else {
3904 endinpos = s - starts + Py_UNICODE_SIZE;
3905 reason = "illegal code point (> 0x10FFFF)";
3906 }
3907 outpos = p - PyUnicode_AS_UNICODE(v);
3908 if (unicode_decode_call_errorhandler(
3909 errors, &errorHandler,
3910 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003911 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003912 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003913 goto onError;
3914 }
3915 }
3916 else {
3917 p++;
3918 s += Py_UNICODE_SIZE;
3919 }
3920 }
3921
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003922 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003923 goto onError;
3924 Py_XDECREF(errorHandler);
3925 Py_XDECREF(exc);
3926 return (PyObject *)v;
3927
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003929 Py_XDECREF(v);
3930 Py_XDECREF(errorHandler);
3931 Py_XDECREF(exc);
3932 return NULL;
3933}
3934
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935/* --- Latin-1 Codec ------------------------------------------------------ */
3936
3937PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 Py_ssize_t size,
3939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940{
3941 PyUnicodeObject *v;
3942 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003943 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003944
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003946 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003947 Py_UNICODE r = *(unsigned char*)s;
3948 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003949 }
3950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 v = _PyUnicode_New(size);
3952 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003957 e = s + size;
3958 /* Unrolling the copy makes it much faster by reducing the looping
3959 overhead. This is similar to what many memcpy() implementations do. */
3960 unrolled_end = e - 4;
3961 while (s < unrolled_end) {
3962 p[0] = (unsigned char) s[0];
3963 p[1] = (unsigned char) s[1];
3964 p[2] = (unsigned char) s[2];
3965 p[3] = (unsigned char) s[3];
3966 s += 4;
3967 p += 4;
3968 }
3969 while (s < e)
3970 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003972
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 Py_XDECREF(v);
3975 return NULL;
3976}
3977
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978/* create or adjust a UnicodeEncodeError */
3979static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 const char *encoding,
3981 const Py_UNICODE *unicode, Py_ssize_t size,
3982 Py_ssize_t startpos, Py_ssize_t endpos,
3983 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 *exceptionObject = PyUnicodeEncodeError_Create(
3987 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 }
3989 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3991 goto onError;
3992 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3993 goto onError;
3994 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3995 goto onError;
3996 return;
3997 onError:
3998 Py_DECREF(*exceptionObject);
3999 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 }
4001}
4002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003/* raises a UnicodeEncodeError */
4004static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 const char *encoding,
4006 const Py_UNICODE *unicode, Py_ssize_t size,
4007 Py_ssize_t startpos, Py_ssize_t endpos,
4008 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009{
4010 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014}
4015
4016/* error handling callback helper:
4017 build arguments, call the callback and check the arguments,
4018 put the result into newpos and return the replacement string, which
4019 has to be freed by the caller */
4020static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 PyObject **errorHandler,
4022 const char *encoding, const char *reason,
4023 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4024 Py_ssize_t startpos, Py_ssize_t endpos,
4025 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004027 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028
4029 PyObject *restuple;
4030 PyObject *resunicode;
4031
4032 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004033 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 }
4037
4038 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042
4043 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004048 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 Py_DECREF(restuple);
4050 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004052 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 &resunicode, newpos)) {
4054 Py_DECREF(restuple);
4055 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004057 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4058 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4059 Py_DECREF(restuple);
4060 return NULL;
4061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004064 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4066 Py_DECREF(restuple);
4067 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004068 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 Py_INCREF(resunicode);
4070 Py_DECREF(restuple);
4071 return resunicode;
4072}
4073
4074static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 Py_ssize_t size,
4076 const char *errors,
4077 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078{
4079 /* output object */
4080 PyObject *res;
4081 /* pointers to the beginning and end+1 of input */
4082 const Py_UNICODE *startp = p;
4083 const Py_UNICODE *endp = p + size;
4084 /* pointer to the beginning of the unencodable characters */
4085 /* const Py_UNICODE *badp = NULL; */
4086 /* pointer into the output */
4087 char *str;
4088 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004089 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004090 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4091 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 PyObject *errorHandler = NULL;
4093 PyObject *exc = NULL;
4094 /* the following variable is used for caching string comparisons
4095 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4096 int known_errorHandler = -1;
4097
4098 /* allocate enough for a simple encoding without
4099 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004100 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004101 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004102 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004104 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004105 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 ressize = size;
4107
4108 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 /* can we encode this? */
4112 if (c<limit) {
4113 /* no overflow check, because we know that the space is enough */
4114 *str++ = (char)c;
4115 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 else {
4118 Py_ssize_t unicodepos = p-startp;
4119 Py_ssize_t requiredsize;
4120 PyObject *repunicode;
4121 Py_ssize_t repsize;
4122 Py_ssize_t newpos;
4123 Py_ssize_t respos;
4124 Py_UNICODE *uni2;
4125 /* startpos for collecting unencodable chars */
4126 const Py_UNICODE *collstart = p;
4127 const Py_UNICODE *collend = p;
4128 /* find all unecodable characters */
4129 while ((collend < endp) && ((*collend)>=limit))
4130 ++collend;
4131 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4132 if (known_errorHandler==-1) {
4133 if ((errors==NULL) || (!strcmp(errors, "strict")))
4134 known_errorHandler = 1;
4135 else if (!strcmp(errors, "replace"))
4136 known_errorHandler = 2;
4137 else if (!strcmp(errors, "ignore"))
4138 known_errorHandler = 3;
4139 else if (!strcmp(errors, "xmlcharrefreplace"))
4140 known_errorHandler = 4;
4141 else
4142 known_errorHandler = 0;
4143 }
4144 switch (known_errorHandler) {
4145 case 1: /* strict */
4146 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4147 goto onError;
4148 case 2: /* replace */
4149 while (collstart++<collend)
4150 *str++ = '?'; /* fall through */
4151 case 3: /* ignore */
4152 p = collend;
4153 break;
4154 case 4: /* xmlcharrefreplace */
4155 respos = str - PyBytes_AS_STRING(res);
4156 /* determine replacement size (temporarily (mis)uses p) */
4157 for (p = collstart, repsize = 0; p < collend; ++p) {
4158 if (*p<10)
4159 repsize += 2+1+1;
4160 else if (*p<100)
4161 repsize += 2+2+1;
4162 else if (*p<1000)
4163 repsize += 2+3+1;
4164 else if (*p<10000)
4165 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004166#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 else
4168 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004169#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004170 else if (*p<100000)
4171 repsize += 2+5+1;
4172 else if (*p<1000000)
4173 repsize += 2+6+1;
4174 else
4175 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004176#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 }
4178 requiredsize = respos+repsize+(endp-collend);
4179 if (requiredsize > ressize) {
4180 if (requiredsize<2*ressize)
4181 requiredsize = 2*ressize;
4182 if (_PyBytes_Resize(&res, requiredsize))
4183 goto onError;
4184 str = PyBytes_AS_STRING(res) + respos;
4185 ressize = requiredsize;
4186 }
4187 /* generate replacement (temporarily (mis)uses p) */
4188 for (p = collstart; p < collend; ++p) {
4189 str += sprintf(str, "&#%d;", (int)*p);
4190 }
4191 p = collend;
4192 break;
4193 default:
4194 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4195 encoding, reason, startp, size, &exc,
4196 collstart-startp, collend-startp, &newpos);
4197 if (repunicode == NULL)
4198 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004199 if (PyBytes_Check(repunicode)) {
4200 /* Directly copy bytes result to output. */
4201 repsize = PyBytes_Size(repunicode);
4202 if (repsize > 1) {
4203 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004204 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004205 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4206 Py_DECREF(repunicode);
4207 goto onError;
4208 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004209 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004210 ressize += repsize-1;
4211 }
4212 memcpy(str, PyBytes_AsString(repunicode), repsize);
4213 str += repsize;
4214 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004215 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004216 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004217 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 /* need more space? (at least enough for what we
4219 have+the replacement+the rest of the string, so
4220 we won't have to check space for encodable characters) */
4221 respos = str - PyBytes_AS_STRING(res);
4222 repsize = PyUnicode_GET_SIZE(repunicode);
4223 requiredsize = respos+repsize+(endp-collend);
4224 if (requiredsize > ressize) {
4225 if (requiredsize<2*ressize)
4226 requiredsize = 2*ressize;
4227 if (_PyBytes_Resize(&res, requiredsize)) {
4228 Py_DECREF(repunicode);
4229 goto onError;
4230 }
4231 str = PyBytes_AS_STRING(res) + respos;
4232 ressize = requiredsize;
4233 }
4234 /* check if there is anything unencodable in the replacement
4235 and copy it to the output */
4236 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4237 c = *uni2;
4238 if (c >= limit) {
4239 raise_encode_exception(&exc, encoding, startp, size,
4240 unicodepos, unicodepos+1, reason);
4241 Py_DECREF(repunicode);
4242 goto onError;
4243 }
4244 *str = (char)c;
4245 }
4246 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004247 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004248 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004249 }
4250 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004251 /* Resize if we allocated to much */
4252 size = str - PyBytes_AS_STRING(res);
4253 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004254 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004255 if (_PyBytes_Resize(&res, size) < 0)
4256 goto onError;
4257 }
4258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 Py_XDECREF(errorHandler);
4260 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004261 return res;
4262
4263 onError:
4264 Py_XDECREF(res);
4265 Py_XDECREF(errorHandler);
4266 Py_XDECREF(exc);
4267 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268}
4269
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 Py_ssize_t size,
4272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275}
4276
4277PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4278{
4279 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 PyErr_BadArgument();
4281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 }
4283 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 PyUnicode_GET_SIZE(unicode),
4285 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286}
4287
4288/* --- 7-bit ASCII Codec -------------------------------------------------- */
4289
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 Py_ssize_t size,
4292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 PyUnicodeObject *v;
4296 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004297 Py_ssize_t startinpos;
4298 Py_ssize_t endinpos;
4299 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 const char *e;
4301 PyObject *errorHandler = NULL;
4302 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004303
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004305 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 Py_UNICODE r = *(unsigned char*)s;
4307 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004308 }
Tim Petersced69f82003-09-16 20:30:58 +00004309
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 v = _PyUnicode_New(size);
4311 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 e = s + size;
4317 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 register unsigned char c = (unsigned char)*s;
4319 if (c < 128) {
4320 *p++ = c;
4321 ++s;
4322 }
4323 else {
4324 startinpos = s-starts;
4325 endinpos = startinpos + 1;
4326 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4327 if (unicode_decode_call_errorhandler(
4328 errors, &errorHandler,
4329 "ascii", "ordinal not in range(128)",
4330 &starts, &e, &startinpos, &endinpos, &exc, &s,
4331 &v, &outpos, &p))
4332 goto onError;
4333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004335 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4337 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 Py_XDECREF(errorHandler);
4339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004341
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 Py_XDECREF(errorHandler);
4345 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 return NULL;
4347}
4348
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 Py_ssize_t size,
4351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354}
4355
4356PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4357{
4358 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 PyErr_BadArgument();
4360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 }
4362 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 PyUnicode_GET_SIZE(unicode),
4364 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365}
4366
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004367#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004368
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004369/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004370
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004371#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004372#define NEED_RETRY
4373#endif
4374
4375/* XXX This code is limited to "true" double-byte encodings, as
4376 a) it assumes an incomplete character consists of a single byte, and
4377 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004379
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  Besides asking IsDBCSLeadByte() about the byte itself, the
   character that CharPrev() reports before it is inspected as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() returned curr itself: nothing precedes it. */
    if (prev == curr)
        return 1;
    /* The preceding character does not start with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The preceding character spans two bytes. */
    return curr - prev == 2;
}
4390
4391/*
4392 * Decode MBCS string into unicode object. If 'final' is set, converts
4393 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4394 */
4395static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 const char *s, /* MBCS string */
4397 int size, /* sizeof MBCS string */
4398 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004399{
4400 Py_UNICODE *p;
4401 Py_ssize_t n = 0;
4402 int usize = 0;
4403
4404 assert(size >= 0);
4405
4406 /* Skip trailing lead-byte unless 'final' is set */
4407 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004409
4410 /* First get the size of the result */
4411 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4413 if (usize == 0) {
4414 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4415 return -1;
4416 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004417 }
4418
4419 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 /* Create unicode object */
4421 *v = _PyUnicode_New(usize);
4422 if (*v == NULL)
4423 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004424 }
4425 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 /* Extend unicode object */
4427 n = PyUnicode_GET_SIZE(*v);
4428 if (_PyUnicode_Resize(v, n + usize) < 0)
4429 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004430 }
4431
4432 /* Do the conversion */
4433 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 p = PyUnicode_AS_UNICODE(*v) + n;
4435 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4436 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4437 return -1;
4438 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004439 }
4440
4441 return size;
4442}
4443
4444PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 Py_ssize_t size,
4446 const char *errors,
4447 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004448{
4449 PyUnicodeObject *v = NULL;
4450 int done;
4451
4452 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004454
4455#ifdef NEED_RETRY
4456 retry:
4457 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004459 else
4460#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004462
4463 if (done < 0) {
4464 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004466 }
4467
4468 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004470
4471#ifdef NEED_RETRY
4472 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 s += done;
4474 size -= done;
4475 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004476 }
4477#endif
4478
4479 return (PyObject *)v;
4480}
4481
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004482PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 Py_ssize_t size,
4484 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004485{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004486 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4487}
4488
4489/*
4490 * Convert unicode into string object (MBCS).
4491 * Returns 0 if succeed, -1 otherwise.
4492 */
4493static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 const Py_UNICODE *p, /* unicode */
4495 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004496{
4497 int mbcssize = 0;
4498 Py_ssize_t n = 0;
4499
4500 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004501
4502 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004503 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4505 if (mbcssize == 0) {
4506 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4507 return -1;
4508 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004509 }
4510
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004511 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 /* Create string object */
4513 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4514 if (*repr == NULL)
4515 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004516 }
4517 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 /* Extend string object */
4519 n = PyBytes_Size(*repr);
4520 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4521 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004522 }
4523
4524 /* Do the conversion */
4525 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 char *s = PyBytes_AS_STRING(*repr) + n;
4527 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4528 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4529 return -1;
4530 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004531 }
4532
4533 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004534}
4535
4536PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 Py_ssize_t size,
4538 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004539{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004540 PyObject *repr = NULL;
4541 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004543#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 else
4548#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004550
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004551 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 Py_XDECREF(repr);
4553 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004554 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004555
4556#ifdef NEED_RETRY
4557 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 p += INT_MAX;
4559 size -= INT_MAX;
4560 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004561 }
4562#endif
4563
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004564 return repr;
4565}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004566
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004567PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4568{
4569 if (!PyUnicode_Check(unicode)) {
4570 PyErr_BadArgument();
4571 return NULL;
4572 }
4573 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 PyUnicode_GET_SIZE(unicode),
4575 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004576}
4577
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578#undef NEED_RETRY
4579
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004580#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004581
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582/* --- Character Mapping Codec -------------------------------------------- */
4583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 Py_ssize_t size,
4586 PyObject *mapping,
4587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004590 Py_ssize_t startinpos;
4591 Py_ssize_t endinpos;
4592 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 PyUnicodeObject *v;
4595 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 PyObject *errorHandler = NULL;
4598 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004599 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004600 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004601
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 /* Default to Latin-1 */
4603 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605
4606 v = _PyUnicode_New(size);
4607 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004613 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 mapstring = PyUnicode_AS_UNICODE(mapping);
4615 maplen = PyUnicode_GET_SIZE(mapping);
4616 while (s < e) {
4617 unsigned char ch = *s;
4618 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 if (ch < maplen)
4621 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 if (x == 0xfffe) {
4624 /* undefined mapping */
4625 outpos = p-PyUnicode_AS_UNICODE(v);
4626 startinpos = s-starts;
4627 endinpos = startinpos+1;
4628 if (unicode_decode_call_errorhandler(
4629 errors, &errorHandler,
4630 "charmap", "character maps to <undefined>",
4631 &starts, &e, &startinpos, &endinpos, &exc, &s,
4632 &v, &outpos, &p)) {
4633 goto onError;
4634 }
4635 continue;
4636 }
4637 *p++ = x;
4638 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004639 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004640 }
4641 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 while (s < e) {
4643 unsigned char ch = *s;
4644 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004645
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4647 w = PyLong_FromLong((long)ch);
4648 if (w == NULL)
4649 goto onError;
4650 x = PyObject_GetItem(mapping, w);
4651 Py_DECREF(w);
4652 if (x == NULL) {
4653 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4654 /* No mapping found means: mapping is undefined. */
4655 PyErr_Clear();
4656 x = Py_None;
4657 Py_INCREF(x);
4658 } else
4659 goto onError;
4660 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004661
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 /* Apply mapping */
4663 if (PyLong_Check(x)) {
4664 long value = PyLong_AS_LONG(x);
4665 if (value < 0 || value > 65535) {
4666 PyErr_SetString(PyExc_TypeError,
4667 "character mapping must be in range(65536)");
4668 Py_DECREF(x);
4669 goto onError;
4670 }
4671 *p++ = (Py_UNICODE)value;
4672 }
4673 else if (x == Py_None) {
4674 /* undefined mapping */
4675 outpos = p-PyUnicode_AS_UNICODE(v);
4676 startinpos = s-starts;
4677 endinpos = startinpos+1;
4678 if (unicode_decode_call_errorhandler(
4679 errors, &errorHandler,
4680 "charmap", "character maps to <undefined>",
4681 &starts, &e, &startinpos, &endinpos, &exc, &s,
4682 &v, &outpos, &p)) {
4683 Py_DECREF(x);
4684 goto onError;
4685 }
4686 Py_DECREF(x);
4687 continue;
4688 }
4689 else if (PyUnicode_Check(x)) {
4690 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004691
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 if (targetsize == 1)
4693 /* 1-1 mapping */
4694 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004695
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 else if (targetsize > 1) {
4697 /* 1-n mapping */
4698 if (targetsize > extrachars) {
4699 /* resize first */
4700 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4701 Py_ssize_t needed = (targetsize - extrachars) + \
4702 (targetsize << 2);
4703 extrachars += needed;
4704 /* XXX overflow detection missing */
4705 if (_PyUnicode_Resize(&v,
4706 PyUnicode_GET_SIZE(v) + needed) < 0) {
4707 Py_DECREF(x);
4708 goto onError;
4709 }
4710 p = PyUnicode_AS_UNICODE(v) + oldpos;
4711 }
4712 Py_UNICODE_COPY(p,
4713 PyUnicode_AS_UNICODE(x),
4714 targetsize);
4715 p += targetsize;
4716 extrachars -= targetsize;
4717 }
4718 /* 1-0 mapping: skip the character */
4719 }
4720 else {
4721 /* wrong return value */
4722 PyErr_SetString(PyExc_TypeError,
4723 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004724 Py_DECREF(x);
4725 goto onError;
4726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 Py_DECREF(x);
4728 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 }
4731 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4733 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 Py_XDECREF(errorHandler);
4735 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004737
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 Py_XDECREF(errorHandler);
4740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 Py_XDECREF(v);
4742 return NULL;
4743}
4744
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004745/* Charmap encoding: the lookup table */
4746
4747struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 PyObject_HEAD
4749 unsigned char level1[32];
4750 int count2, count3;
4751 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004752};
4753
4754static PyObject*
4755encoding_map_size(PyObject *obj, PyObject* args)
4756{
4757 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004758 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004760}
4761
4762static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004763 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 PyDoc_STR("Return the size (in bytes) of this object") },
4765 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004766};
4767
4768static void
4769encoding_map_dealloc(PyObject* o)
4770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004771 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004772}
4773
4774static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004775 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 "EncodingMap", /*tp_name*/
4777 sizeof(struct encoding_map), /*tp_basicsize*/
4778 0, /*tp_itemsize*/
4779 /* methods */
4780 encoding_map_dealloc, /*tp_dealloc*/
4781 0, /*tp_print*/
4782 0, /*tp_getattr*/
4783 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004784 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 0, /*tp_repr*/
4786 0, /*tp_as_number*/
4787 0, /*tp_as_sequence*/
4788 0, /*tp_as_mapping*/
4789 0, /*tp_hash*/
4790 0, /*tp_call*/
4791 0, /*tp_str*/
4792 0, /*tp_getattro*/
4793 0, /*tp_setattro*/
4794 0, /*tp_as_buffer*/
4795 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4796 0, /*tp_doc*/
4797 0, /*tp_traverse*/
4798 0, /*tp_clear*/
4799 0, /*tp_richcompare*/
4800 0, /*tp_weaklistoffset*/
4801 0, /*tp_iter*/
4802 0, /*tp_iternext*/
4803 encoding_map_methods, /*tp_methods*/
4804 0, /*tp_members*/
4805 0, /*tp_getset*/
4806 0, /*tp_base*/
4807 0, /*tp_dict*/
4808 0, /*tp_descr_get*/
4809 0, /*tp_descr_set*/
4810 0, /*tp_dictoffset*/
4811 0, /*tp_init*/
4812 0, /*tp_alloc*/
4813 0, /*tp_new*/
4814 0, /*tp_free*/
4815 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004816};
4817
4818PyObject*
4819PyUnicode_BuildEncodingMap(PyObject* string)
4820{
4821 Py_UNICODE *decode;
4822 PyObject *result;
4823 struct encoding_map *mresult;
4824 int i;
4825 int need_dict = 0;
4826 unsigned char level1[32];
4827 unsigned char level2[512];
4828 unsigned char *mlevel1, *mlevel2, *mlevel3;
4829 int count2 = 0, count3 = 0;
4830
4831 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4832 PyErr_BadArgument();
4833 return NULL;
4834 }
4835 decode = PyUnicode_AS_UNICODE(string);
4836 memset(level1, 0xFF, sizeof level1);
4837 memset(level2, 0xFF, sizeof level2);
4838
4839 /* If there isn't a one-to-one mapping of NULL to \0,
4840 or if there are non-BMP characters, we need to use
4841 a mapping dictionary. */
4842 if (decode[0] != 0)
4843 need_dict = 1;
4844 for (i = 1; i < 256; i++) {
4845 int l1, l2;
4846 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004847#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004848 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004849#endif
4850 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004851 need_dict = 1;
4852 break;
4853 }
4854 if (decode[i] == 0xFFFE)
4855 /* unmapped character */
4856 continue;
4857 l1 = decode[i] >> 11;
4858 l2 = decode[i] >> 7;
4859 if (level1[l1] == 0xFF)
4860 level1[l1] = count2++;
4861 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004862 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004863 }
4864
4865 if (count2 >= 0xFF || count3 >= 0xFF)
4866 need_dict = 1;
4867
4868 if (need_dict) {
4869 PyObject *result = PyDict_New();
4870 PyObject *key, *value;
4871 if (!result)
4872 return NULL;
4873 for (i = 0; i < 256; i++) {
4874 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004875 key = PyLong_FromLong(decode[i]);
4876 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004877 if (!key || !value)
4878 goto failed1;
4879 if (PyDict_SetItem(result, key, value) == -1)
4880 goto failed1;
4881 Py_DECREF(key);
4882 Py_DECREF(value);
4883 }
4884 return result;
4885 failed1:
4886 Py_XDECREF(key);
4887 Py_XDECREF(value);
4888 Py_DECREF(result);
4889 return NULL;
4890 }
4891
4892 /* Create a three-level trie */
4893 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4894 16*count2 + 128*count3 - 1);
4895 if (!result)
4896 return PyErr_NoMemory();
4897 PyObject_Init(result, &EncodingMapType);
4898 mresult = (struct encoding_map*)result;
4899 mresult->count2 = count2;
4900 mresult->count3 = count3;
4901 mlevel1 = mresult->level1;
4902 mlevel2 = mresult->level23;
4903 mlevel3 = mresult->level23 + 16*count2;
4904 memcpy(mlevel1, level1, 32);
4905 memset(mlevel2, 0xFF, 16*count2);
4906 memset(mlevel3, 0, 128*count3);
4907 count3 = 0;
4908 for (i = 1; i < 256; i++) {
4909 int o1, o2, o3, i2, i3;
4910 if (decode[i] == 0xFFFE)
4911 /* unmapped character */
4912 continue;
4913 o1 = decode[i]>>11;
4914 o2 = (decode[i]>>7) & 0xF;
4915 i2 = 16*mlevel1[o1] + o2;
4916 if (mlevel2[i2] == 0xFF)
4917 mlevel2[i2] = count3++;
4918 o3 = decode[i] & 0x7F;
4919 i3 = 128*mlevel2[i2] + o3;
4920 mlevel3[i3] = i;
4921 }
4922 return result;
4923}
4924
4925static int
4926encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4927{
4928 struct encoding_map *map = (struct encoding_map*)mapping;
4929 int l1 = c>>11;
4930 int l2 = (c>>7) & 0xF;
4931 int l3 = c & 0x7F;
4932 int i;
4933
4934#ifdef Py_UNICODE_WIDE
4935 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004937 }
4938#endif
4939 if (c == 0)
4940 return 0;
4941 /* level 1*/
4942 i = map->level1[l1];
4943 if (i == 0xFF) {
4944 return -1;
4945 }
4946 /* level 2*/
4947 i = map->level23[16*i+l2];
4948 if (i == 0xFF) {
4949 return -1;
4950 }
4951 /* level 3 */
4952 i = map->level23[16*map->count2 + 128*i + l3];
4953 if (i == 0) {
4954 return -1;
4955 }
4956 return i;
4957}
4958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959/* Lookup the character ch in the mapping. If the character
4960 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004961 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963{
Christian Heimes217cfd12007-12-02 14:31:20 +00004964 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 PyObject *x;
4966
4967 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 x = PyObject_GetItem(mapping, w);
4970 Py_DECREF(w);
4971 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4973 /* No mapping found means: mapping is undefined. */
4974 PyErr_Clear();
4975 x = Py_None;
4976 Py_INCREF(x);
4977 return x;
4978 } else
4979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004981 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004983 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 long value = PyLong_AS_LONG(x);
4985 if (value < 0 || value > 255) {
4986 PyErr_SetString(PyExc_TypeError,
4987 "character mapping must be in range(256)");
4988 Py_DECREF(x);
4989 return NULL;
4990 }
4991 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004993 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 /* wrong return value */
4997 PyErr_Format(PyExc_TypeError,
4998 "character mapping must return integer, bytes or None, not %.400s",
4999 x->ob_type->tp_name);
5000 Py_DECREF(x);
5001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
5003}
5004
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005005static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005006charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005008 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5009 /* exponentially overallocate to minimize reallocations */
5010 if (requiredsize < 2*outsize)
5011 requiredsize = 2*outsize;
5012 if (_PyBytes_Resize(outobj, requiredsize))
5013 return -1;
5014 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005015}
5016
/* Outcome of trying to encode one character via charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005021 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 space is available. Return a new reference to the object that
5023 was put in the output buffer, or Py_None, if the mapping was undefined
5024 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005025 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005026static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005027charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005030 PyObject *rep;
5031 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005032 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033
Christian Heimes90aa7642007-12-19 02:45:37 +00005034 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005035 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005037 if (res == -1)
5038 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 if (outsize<requiredsize)
5040 if (charmapencode_resize(outobj, outpos, requiredsize))
5041 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005042 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 outstart[(*outpos)++] = (char)res;
5044 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005045 }
5046
5047 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005050 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 Py_DECREF(rep);
5052 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005053 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 if (PyLong_Check(rep)) {
5055 Py_ssize_t requiredsize = *outpos+1;
5056 if (outsize<requiredsize)
5057 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5058 Py_DECREF(rep);
5059 return enc_EXCEPTION;
5060 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005061 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005063 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 else {
5065 const char *repchars = PyBytes_AS_STRING(rep);
5066 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5067 Py_ssize_t requiredsize = *outpos+repsize;
5068 if (outsize<requiredsize)
5069 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5070 Py_DECREF(rep);
5071 return enc_EXCEPTION;
5072 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005073 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 memcpy(outstart + *outpos, repchars, repsize);
5075 *outpos += repsize;
5076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005078 Py_DECREF(rep);
5079 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080}
5081
5082/* handle an error in PyUnicode_EncodeCharmap
5083 Return 0 on success, -1 on error */
5084static
5085int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005086 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005088 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005089 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090{
5091 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t repsize;
5093 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 Py_UNICODE *uni2;
5095 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005096 Py_ssize_t collstartpos = *inpos;
5097 Py_ssize_t collendpos = *inpos+1;
5098 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 char *encoding = "charmap";
5100 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005101 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 /* find all unencodable characters */
5104 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005105 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005106 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 int res = encoding_map_lookup(p[collendpos], mapping);
5108 if (res != -1)
5109 break;
5110 ++collendpos;
5111 continue;
5112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005113
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 rep = charmapencode_lookup(p[collendpos], mapping);
5115 if (rep==NULL)
5116 return -1;
5117 else if (rep!=Py_None) {
5118 Py_DECREF(rep);
5119 break;
5120 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005121 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 }
5124 /* cache callback name lookup
5125 * (if not done yet, i.e. it's the first error) */
5126 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 if ((errors==NULL) || (!strcmp(errors, "strict")))
5128 *known_errorHandler = 1;
5129 else if (!strcmp(errors, "replace"))
5130 *known_errorHandler = 2;
5131 else if (!strcmp(errors, "ignore"))
5132 *known_errorHandler = 3;
5133 else if (!strcmp(errors, "xmlcharrefreplace"))
5134 *known_errorHandler = 4;
5135 else
5136 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 }
5138 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005139 case 1: /* strict */
5140 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5141 return -1;
5142 case 2: /* replace */
5143 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 x = charmapencode_output('?', mapping, res, respos);
5145 if (x==enc_EXCEPTION) {
5146 return -1;
5147 }
5148 else if (x==enc_FAILED) {
5149 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5150 return -1;
5151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005152 }
5153 /* fall through */
5154 case 3: /* ignore */
5155 *inpos = collendpos;
5156 break;
5157 case 4: /* xmlcharrefreplace */
5158 /* generate replacement (temporarily (mis)uses p) */
5159 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 char buffer[2+29+1+1];
5161 char *cp;
5162 sprintf(buffer, "&#%d;", (int)p[collpos]);
5163 for (cp = buffer; *cp; ++cp) {
5164 x = charmapencode_output(*cp, mapping, res, respos);
5165 if (x==enc_EXCEPTION)
5166 return -1;
5167 else if (x==enc_FAILED) {
5168 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5169 return -1;
5170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005171 }
5172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005173 *inpos = collendpos;
5174 break;
5175 default:
5176 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 encoding, reason, p, size, exceptionObject,
5178 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005179 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005181 if (PyBytes_Check(repunicode)) {
5182 /* Directly copy bytes result to output. */
5183 Py_ssize_t outsize = PyBytes_Size(*res);
5184 Py_ssize_t requiredsize;
5185 repsize = PyBytes_Size(repunicode);
5186 requiredsize = *respos + repsize;
5187 if (requiredsize > outsize)
5188 /* Make room for all additional bytes. */
5189 if (charmapencode_resize(res, respos, requiredsize)) {
5190 Py_DECREF(repunicode);
5191 return -1;
5192 }
5193 memcpy(PyBytes_AsString(*res) + *respos,
5194 PyBytes_AsString(repunicode), repsize);
5195 *respos += repsize;
5196 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005197 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005198 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005200 /* generate replacement */
5201 repsize = PyUnicode_GET_SIZE(repunicode);
5202 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 x = charmapencode_output(*uni2, mapping, res, respos);
5204 if (x==enc_EXCEPTION) {
5205 return -1;
5206 }
5207 else if (x==enc_FAILED) {
5208 Py_DECREF(repunicode);
5209 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5210 return -1;
5211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005212 }
5213 *inpos = newpos;
5214 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 }
5216 return 0;
5217}
5218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 PyObject *mapping,
5222 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005224 /* output object */
5225 PyObject *res = NULL;
5226 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005227 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 PyObject *errorHandler = NULL;
5231 PyObject *exc = NULL;
5232 /* the following variable is used for caching string comparisons
5233 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5234 * 3=ignore, 4=xmlcharrefreplace */
5235 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236
5237 /* Default to Latin-1 */
5238 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 /* allocate enough for a simple encoding without
5242 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005243 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 if (res == NULL)
5245 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005246 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 /* try to encode it */
5251 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5252 if (x==enc_EXCEPTION) /* error */
5253 goto onError;
5254 if (x==enc_FAILED) { /* unencodable character */
5255 if (charmap_encoding_error(p, size, &inpos, mapping,
5256 &exc,
5257 &known_errorHandler, &errorHandler, errors,
5258 &res, &respos)) {
5259 goto onError;
5260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005261 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 else
5263 /* done with this character => adjust input position */
5264 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005268 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 if (_PyBytes_Resize(&res, respos) < 0)
5270 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005272 Py_XDECREF(exc);
5273 Py_XDECREF(errorHandler);
5274 return res;
5275
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 Py_XDECREF(res);
5278 Py_XDECREF(exc);
5279 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 return NULL;
5281}
5282
5283PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
5286 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 PyErr_BadArgument();
5288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 }
5290 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 PyUnicode_GET_SIZE(unicode),
5292 mapping,
5293 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294}
5295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296/* create or adjust a UnicodeTranslateError */
5297static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 const Py_UNICODE *unicode, Py_ssize_t size,
5299 Py_ssize_t startpos, Py_ssize_t endpos,
5300 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005303 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 }
5306 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5308 goto onError;
5309 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5310 goto onError;
5311 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5312 goto onError;
5313 return;
5314 onError:
5315 Py_DECREF(*exceptionObject);
5316 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 }
5318}
5319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320/* raises a UnicodeTranslateError */
5321static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 const Py_UNICODE *unicode, Py_ssize_t size,
5323 Py_ssize_t startpos, Py_ssize_t endpos,
5324 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005325{
5326 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005328 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330}
5331
5332/* error handling callback helper:
5333 build arguments, call the callback and check the arguments,
5334 put the result into newpos and return the replacement string, which
5335 has to be freed by the caller */
5336static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 PyObject **errorHandler,
5338 const char *reason,
5339 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5340 Py_ssize_t startpos, Py_ssize_t endpos,
5341 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005343 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005345 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 PyObject *restuple;
5347 PyObject *resunicode;
5348
5349 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 }
5354
5355 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359
5360 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005365 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 Py_DECREF(restuple);
5367 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 }
5369 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 &resunicode, &i_newpos)) {
5371 Py_DECREF(restuple);
5372 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 else
5377 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005378 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5380 Py_DECREF(restuple);
5381 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 Py_INCREF(resunicode);
5384 Py_DECREF(restuple);
5385 return resunicode;
5386}
5387
5388/* Lookup the character ch in the mapping and put the result in result,
5389 which must be decrefed by the caller.
5390 Return 0 on success, -1 on error */
5391static
5392int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5393{
Christian Heimes217cfd12007-12-02 14:31:20 +00005394 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 PyObject *x;
5396
5397 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 x = PyObject_GetItem(mapping, w);
5400 Py_DECREF(w);
5401 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5403 /* No mapping found means: use 1:1 mapping. */
5404 PyErr_Clear();
5405 *result = NULL;
5406 return 0;
5407 } else
5408 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 }
5410 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 *result = x;
5412 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005414 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 long value = PyLong_AS_LONG(x);
5416 long max = PyUnicode_GetMax();
5417 if (value < 0 || value > max) {
5418 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005419 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_DECREF(x);
5421 return -1;
5422 }
5423 *result = x;
5424 return 0;
5425 }
5426 else if (PyUnicode_Check(x)) {
5427 *result = x;
5428 return 0;
5429 }
5430 else {
5431 /* wrong return value */
5432 PyErr_SetString(PyExc_TypeError,
5433 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005434 Py_DECREF(x);
5435 return -1;
5436 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437}
5438/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 if not reallocate and adjust various state variables.
5440 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441static
Walter Dörwald4894c302003-10-24 14:25:28 +00005442int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005445 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005446 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 /* remember old output position */
5448 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5449 /* exponentially overallocate to minimize reallocations */
5450 if (requiredsize < 2 * oldsize)
5451 requiredsize = 2 * oldsize;
5452 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5453 return -1;
5454 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455 }
5456 return 0;
5457}
5458/* lookup the character, put the result in the output string and adjust
5459 various state variables. Return a new reference to the object that
5460 was put in the output buffer in *result, or Py_None, if the mapping was
5461 undefined (in which case no character was written).
5462 The called must decref result.
5463 Return 0 on success, -1 on error. */
5464static
Walter Dörwald4894c302003-10-24 14:25:28 +00005465int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5467 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468{
Walter Dörwald4894c302003-10-24 14:25:28 +00005469 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 /* not found => default to 1:1 mapping */
5473 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 }
5475 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005477 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 /* no overflow check, because we know that the space is enough */
5479 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 }
5481 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5483 if (repsize==1) {
5484 /* no overflow check, because we know that the space is enough */
5485 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5486 }
5487 else if (repsize!=0) {
5488 /* more than one character */
5489 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5490 (insize - (curinp-startinp)) +
5491 repsize - 1;
5492 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5493 return -1;
5494 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5495 *outp += repsize;
5496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 }
5498 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500 return 0;
5501}
5502
5503PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 Py_ssize_t size,
5505 PyObject *mapping,
5506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 /* output object */
5509 PyObject *res = NULL;
5510 /* pointers to the beginning and end+1 of input */
5511 const Py_UNICODE *startp = p;
5512 const Py_UNICODE *endp = p + size;
5513 /* pointer into the output */
5514 Py_UNICODE *str;
5515 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005516 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 char *reason = "character maps to <undefined>";
5518 PyObject *errorHandler = NULL;
5519 PyObject *exc = NULL;
5520 /* the following variable is used for caching string comparisons
5521 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5522 * 3=ignore, 4=xmlcharrefreplace */
5523 int known_errorHandler = -1;
5524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 PyErr_BadArgument();
5527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529
5530 /* allocate enough for a simple 1:1 translation without
5531 replacements, if we need more, we'll resize */
5532 res = PyUnicode_FromUnicode(NULL, size);
5533 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 /* try to encode it */
5541 PyObject *x = NULL;
5542 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5543 Py_XDECREF(x);
5544 goto onError;
5545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005546 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 if (x!=Py_None) /* it worked => adjust input pointer */
5548 ++p;
5549 else { /* untranslatable character */
5550 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5551 Py_ssize_t repsize;
5552 Py_ssize_t newpos;
5553 Py_UNICODE *uni2;
5554 /* startpos for collecting untranslatable chars */
5555 const Py_UNICODE *collstart = p;
5556 const Py_UNICODE *collend = p+1;
5557 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 /* find all untranslatable characters */
5560 while (collend < endp) {
5561 if (charmaptranslate_lookup(*collend, mapping, &x))
5562 goto onError;
5563 Py_XDECREF(x);
5564 if (x!=Py_None)
5565 break;
5566 ++collend;
5567 }
5568 /* cache callback name lookup
5569 * (if not done yet, i.e. it's the first error) */
5570 if (known_errorHandler==-1) {
5571 if ((errors==NULL) || (!strcmp(errors, "strict")))
5572 known_errorHandler = 1;
5573 else if (!strcmp(errors, "replace"))
5574 known_errorHandler = 2;
5575 else if (!strcmp(errors, "ignore"))
5576 known_errorHandler = 3;
5577 else if (!strcmp(errors, "xmlcharrefreplace"))
5578 known_errorHandler = 4;
5579 else
5580 known_errorHandler = 0;
5581 }
5582 switch (known_errorHandler) {
5583 case 1: /* strict */
5584 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005585 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 case 2: /* replace */
5587 /* No need to check for space, this is a 1:1 replacement */
5588 for (coll = collstart; coll<collend; ++coll)
5589 *str++ = '?';
5590 /* fall through */
5591 case 3: /* ignore */
5592 p = collend;
5593 break;
5594 case 4: /* xmlcharrefreplace */
5595 /* generate replacement (temporarily (mis)uses p) */
5596 for (p = collstart; p < collend; ++p) {
5597 char buffer[2+29+1+1];
5598 char *cp;
5599 sprintf(buffer, "&#%d;", (int)*p);
5600 if (charmaptranslate_makespace(&res, &str,
5601 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5602 goto onError;
5603 for (cp = buffer; *cp; ++cp)
5604 *str++ = *cp;
5605 }
5606 p = collend;
5607 break;
5608 default:
5609 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5610 reason, startp, size, &exc,
5611 collstart-startp, collend-startp, &newpos);
5612 if (repunicode == NULL)
5613 goto onError;
5614 /* generate replacement */
5615 repsize = PyUnicode_GET_SIZE(repunicode);
5616 if (charmaptranslate_makespace(&res, &str,
5617 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5618 Py_DECREF(repunicode);
5619 goto onError;
5620 }
5621 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5622 *str++ = *uni2;
5623 p = startp + newpos;
5624 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005626 }
5627 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 /* Resize if we allocated to much */
5629 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005630 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 if (PyUnicode_Resize(&res, respos) < 0)
5632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 }
5634 Py_XDECREF(exc);
5635 Py_XDECREF(errorHandler);
5636 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 Py_XDECREF(res);
5640 Py_XDECREF(exc);
5641 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 return NULL;
5643}
5644
5645PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 PyObject *mapping,
5647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648{
5649 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005650
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 str = PyUnicode_FromObject(str);
5652 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 PyUnicode_GET_SIZE(str),
5656 mapping,
5657 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 Py_DECREF(str);
5659 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005660
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 Py_XDECREF(str);
5663 return NULL;
5664}
Tim Petersced69f82003-09-16 20:30:58 +00005665
Guido van Rossum9e896b32000-04-05 20:11:21 +00005666/* --- Decimal Encoder ---------------------------------------------------- */
5667
5668int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 Py_ssize_t length,
5670 char *output,
5671 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005672{
5673 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 PyObject *errorHandler = NULL;
5675 PyObject *exc = NULL;
5676 const char *encoding = "decimal";
5677 const char *reason = "invalid decimal Unicode string";
5678 /* the following variable is used for caching string comparisons
5679 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5680 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005681
5682 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 PyErr_BadArgument();
5684 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005685 }
5686
5687 p = s;
5688 end = s + length;
5689 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 register Py_UNICODE ch = *p;
5691 int decimal;
5692 PyObject *repunicode;
5693 Py_ssize_t repsize;
5694 Py_ssize_t newpos;
5695 Py_UNICODE *uni2;
5696 Py_UNICODE *collstart;
5697 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005700 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 ++p;
5702 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 decimal = Py_UNICODE_TODECIMAL(ch);
5705 if (decimal >= 0) {
5706 *output++ = '0' + decimal;
5707 ++p;
5708 continue;
5709 }
5710 if (0 < ch && ch < 256) {
5711 *output++ = (char)ch;
5712 ++p;
5713 continue;
5714 }
5715 /* All other characters are considered unencodable */
5716 collstart = p;
5717 collend = p+1;
5718 while (collend < end) {
5719 if ((0 < *collend && *collend < 256) ||
5720 !Py_UNICODE_ISSPACE(*collend) ||
5721 Py_UNICODE_TODECIMAL(*collend))
5722 break;
5723 }
5724 /* cache callback name lookup
5725 * (if not done yet, i.e. it's the first error) */
5726 if (known_errorHandler==-1) {
5727 if ((errors==NULL) || (!strcmp(errors, "strict")))
5728 known_errorHandler = 1;
5729 else if (!strcmp(errors, "replace"))
5730 known_errorHandler = 2;
5731 else if (!strcmp(errors, "ignore"))
5732 known_errorHandler = 3;
5733 else if (!strcmp(errors, "xmlcharrefreplace"))
5734 known_errorHandler = 4;
5735 else
5736 known_errorHandler = 0;
5737 }
5738 switch (known_errorHandler) {
5739 case 1: /* strict */
5740 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5741 goto onError;
5742 case 2: /* replace */
5743 for (p = collstart; p < collend; ++p)
5744 *output++ = '?';
5745 /* fall through */
5746 case 3: /* ignore */
5747 p = collend;
5748 break;
5749 case 4: /* xmlcharrefreplace */
5750 /* generate replacement (temporarily (mis)uses p) */
5751 for (p = collstart; p < collend; ++p)
5752 output += sprintf(output, "&#%d;", (int)*p);
5753 p = collend;
5754 break;
5755 default:
5756 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5757 encoding, reason, s, length, &exc,
5758 collstart-s, collend-s, &newpos);
5759 if (repunicode == NULL)
5760 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005761 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005762 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005763 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5764 Py_DECREF(repunicode);
5765 goto onError;
5766 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 /* generate replacement */
5768 repsize = PyUnicode_GET_SIZE(repunicode);
5769 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5770 Py_UNICODE ch = *uni2;
5771 if (Py_UNICODE_ISSPACE(ch))
5772 *output++ = ' ';
5773 else {
5774 decimal = Py_UNICODE_TODECIMAL(ch);
5775 if (decimal >= 0)
5776 *output++ = '0' + decimal;
5777 else if (0 < ch && ch < 256)
5778 *output++ = (char)ch;
5779 else {
5780 Py_DECREF(repunicode);
5781 raise_encode_exception(&exc, encoding,
5782 s, length, collstart-s, collend-s, reason);
5783 goto onError;
5784 }
5785 }
5786 }
5787 p = s + newpos;
5788 Py_DECREF(repunicode);
5789 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005790 }
5791 /* 0-terminate the output string */
5792 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 Py_XDECREF(exc);
5794 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005795 return 0;
5796
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 Py_XDECREF(exc);
5799 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005800 return -1;
5801}
5802
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803/* --- Helpers ------------------------------------------------------------ */
5804
Eric Smith8c663262007-08-25 02:26:07 +00005805#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005806#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005807#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005808/* Include _ParseTupleFinds from find.h */
5809#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810#include "stringlib/find.h"
5811#include "stringlib/partition.h"
5812
Eric Smith5807c412008-05-11 21:00:57 +00005813#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005814#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005815#include "stringlib/localeutil.h"
5816
/* Helper macro to fix up start/end slice values.

   Expects variables named `start` and `end` in the enclosing scope and
   clamps them against (obj)->length: a negative index is first offset by
   the length and then floored at 0, and `end` is capped at the length.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used as the body of an unbraced if/else.
   Invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5829
Martin v. Löwis18e16552006-02-15 17:27:45 +00005830Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005831 PyObject *substr,
5832 Py_ssize_t start,
5833 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836 PyUnicodeObject* str_obj;
5837 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005838
Thomas Wouters477c8d52006-05-27 19:21:47 +00005839 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5840 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5843 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 Py_DECREF(str_obj);
5845 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 }
Tim Petersced69f82003-09-16 20:30:58 +00005847
Thomas Wouters477c8d52006-05-27 19:21:47 +00005848 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005849
Thomas Wouters477c8d52006-05-27 19:21:47 +00005850 result = stringlib_count(
5851 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5852 );
5853
5854 Py_DECREF(sub_obj);
5855 Py_DECREF(str_obj);
5856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return result;
5858}
5859
Martin v. Löwis18e16552006-02-15 17:27:45 +00005860Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005861 PyObject *sub,
5862 Py_ssize_t start,
5863 Py_ssize_t end,
5864 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 sub = PyUnicode_FromObject(sub);
5872 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 Py_DECREF(str);
5874 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 }
Tim Petersced69f82003-09-16 20:30:58 +00005876
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 if (direction > 0)
5878 result = stringlib_find_slice(
5879 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5880 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5881 start, end
5882 );
5883 else
5884 result = stringlib_rfind_slice(
5885 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5886 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5887 start, end
5888 );
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005891 Py_DECREF(sub);
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 return result;
5894}
5895
Tim Petersced69f82003-09-16 20:30:58 +00005896static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 PyUnicodeObject *substring,
5899 Py_ssize_t start,
5900 Py_ssize_t end,
5901 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (substring->length == 0)
5904 return 1;
5905
Thomas Wouters477c8d52006-05-27 19:21:47 +00005906 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
5908 end -= substring->length;
5909 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
5912 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 if (Py_UNICODE_MATCH(self, end, substring))
5914 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 } else {
5916 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 }
5919
5920 return 0;
5921}
5922
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 PyObject *substr,
5925 Py_ssize_t start,
5926 Py_ssize_t end,
5927 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005929 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 str = PyUnicode_FromObject(str);
5932 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 substr = PyUnicode_FromObject(substr);
5935 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 Py_DECREF(str);
5937 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 }
Tim Petersced69f82003-09-16 20:30:58 +00005939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 (PyUnicodeObject *)substr,
5942 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 Py_DECREF(str);
5944 Py_DECREF(substr);
5945 return result;
5946}
5947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948/* Apply fixfct filter to the Unicode object self and return a
5949 reference to the modified object */
5950
Tim Petersced69f82003-09-16 20:30:58 +00005951static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
5955
5956 PyUnicodeObject *u;
5957
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005958 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005961
5962 Py_UNICODE_COPY(u->str, self->str, self->length);
5963
Tim Peters7a29bd52001-09-12 03:03:31 +00005964 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 /* fixfct should return TRUE if it modified the buffer. If
5966 FALSE, return a reference to the original buffer instead
5967 (to save space, not time) */
5968 Py_INCREF(self);
5969 Py_DECREF(u);
5970 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
5972 return (PyObject*) u;
5973}
5974
Tim Petersced69f82003-09-16 20:30:58 +00005975static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976int fixupper(PyUnicodeObject *self)
5977{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 Py_UNICODE *s = self->str;
5980 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 ch = Py_UNICODE_TOUPPER(*s);
5986 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 *s = ch;
5989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 s++;
5991 }
5992
5993 return status;
5994}
5995
Tim Petersced69f82003-09-16 20:30:58 +00005996static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997int fixlower(PyUnicodeObject *self)
5998{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005999 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 Py_UNICODE *s = self->str;
6001 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006002
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006005
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 ch = Py_UNICODE_TOLOWER(*s);
6007 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 *s = ch;
6010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 s++;
6012 }
6013
6014 return status;
6015}
6016
Tim Petersced69f82003-09-16 20:30:58 +00006017static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018int fixswapcase(PyUnicodeObject *self)
6019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006020 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 Py_UNICODE *s = self->str;
6022 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 while (len-- > 0) {
6025 if (Py_UNICODE_ISUPPER(*s)) {
6026 *s = Py_UNICODE_TOLOWER(*s);
6027 status = 1;
6028 } else if (Py_UNICODE_ISLOWER(*s)) {
6029 *s = Py_UNICODE_TOUPPER(*s);
6030 status = 1;
6031 }
6032 s++;
6033 }
6034
6035 return status;
6036}
6037
Tim Petersced69f82003-09-16 20:30:58 +00006038static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039int fixcapitalize(PyUnicodeObject *self)
6040{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006042 Py_UNICODE *s = self->str;
6043 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006044
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006045 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006047 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 *s = Py_UNICODE_TOUPPER(*s);
6049 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006051 s++;
6052 while (--len > 0) {
6053 if (Py_UNICODE_ISUPPER(*s)) {
6054 *s = Py_UNICODE_TOLOWER(*s);
6055 status = 1;
6056 }
6057 s++;
6058 }
6059 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060}
6061
6062static
6063int fixtitle(PyUnicodeObject *self)
6064{
6065 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6066 register Py_UNICODE *e;
6067 int previous_is_cased;
6068
6069 /* Shortcut for single character strings */
6070 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6072 if (*p != ch) {
6073 *p = ch;
6074 return 1;
6075 }
6076 else
6077 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
Tim Petersced69f82003-09-16 20:30:58 +00006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 e = p + PyUnicode_GET_SIZE(self);
6081 previous_is_cased = 0;
6082 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006084
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 if (previous_is_cased)
6086 *p = Py_UNICODE_TOLOWER(ch);
6087 else
6088 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006089
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 if (Py_UNICODE_ISLOWER(ch) ||
6091 Py_UNICODE_ISUPPER(ch) ||
6092 Py_UNICODE_ISTITLE(ch))
6093 previous_is_cased = 1;
6094 else
6095 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
6097 return 1;
6098}
6099
Tim Peters8ce9f162004-08-27 01:49:32 +00006100PyObject *
6101PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102{
Skip Montanaro6543b452004-09-16 03:28:13 +00006103 const Py_UNICODE blank = ' ';
6104 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006105 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006106 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006107 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6108 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006109 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6110 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006111 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006112 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
Tim Peters05eba1f2004-08-27 21:32:02 +00006114 fseq = PySequence_Fast(seq, "");
6115 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006116 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006117 }
6118
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006119 /* NOTE: the following code can't call back into Python code,
6120 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006121 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006122
Tim Peters05eba1f2004-08-27 21:32:02 +00006123 seqlen = PySequence_Fast_GET_SIZE(fseq);
6124 /* If empty sequence, return u"". */
6125 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006126 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6127 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006128 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006129 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006130 /* If singleton sequence with an exact Unicode, return that. */
6131 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 item = items[0];
6133 if (PyUnicode_CheckExact(item)) {
6134 Py_INCREF(item);
6135 res = (PyUnicodeObject *)item;
6136 goto Done;
6137 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006138 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006139 else {
6140 /* Set up sep and seplen */
6141 if (separator == NULL) {
6142 sep = &blank;
6143 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006144 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006145 else {
6146 if (!PyUnicode_Check(separator)) {
6147 PyErr_Format(PyExc_TypeError,
6148 "separator: expected str instance,"
6149 " %.80s found",
6150 Py_TYPE(separator)->tp_name);
6151 goto onError;
6152 }
6153 sep = PyUnicode_AS_UNICODE(separator);
6154 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006155 }
6156 }
6157
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006158 /* There are at least two things to join, or else we have a subclass
6159 * of str in the sequence.
6160 * Do a pre-pass to figure out the total amount of space we'll
6161 * need (sz), and see whether all argument are strings.
6162 */
6163 sz = 0;
6164 for (i = 0; i < seqlen; i++) {
6165 const Py_ssize_t old_sz = sz;
6166 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 if (!PyUnicode_Check(item)) {
6168 PyErr_Format(PyExc_TypeError,
6169 "sequence item %zd: expected str instance,"
6170 " %.80s found",
6171 i, Py_TYPE(item)->tp_name);
6172 goto onError;
6173 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006174 sz += PyUnicode_GET_SIZE(item);
6175 if (i != 0)
6176 sz += seplen;
6177 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6178 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006180 goto onError;
6181 }
6182 }
Tim Petersced69f82003-09-16 20:30:58 +00006183
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006184 res = _PyUnicode_New(sz);
6185 if (res == NULL)
6186 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006187
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006188 /* Catenate everything. */
6189 res_p = PyUnicode_AS_UNICODE(res);
6190 for (i = 0; i < seqlen; ++i) {
6191 Py_ssize_t itemlen;
6192 item = items[i];
6193 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 /* Copy item, and maybe the separator. */
6195 if (i) {
6196 Py_UNICODE_COPY(res_p, sep, seplen);
6197 res_p += seplen;
6198 }
6199 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6200 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006201 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006204 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 return (PyObject *)res;
6206
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006208 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006209 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return NULL;
6211}
6212
Tim Petersced69f82003-09-16 20:30:58 +00006213static
6214PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 Py_ssize_t left,
6216 Py_ssize_t right,
6217 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
6219 PyUnicodeObject *u;
6220
6221 if (left < 0)
6222 left = 0;
6223 if (right < 0)
6224 right = 0;
6225
Tim Peters7a29bd52001-09-12 03:03:31 +00006226 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 Py_INCREF(self);
6228 return self;
6229 }
6230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006231 if (left > PY_SSIZE_T_MAX - self->length ||
6232 right > PY_SSIZE_T_MAX - (left + self->length)) {
6233 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6234 return NULL;
6235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 u = _PyUnicode_New(left + self->length + right);
6237 if (u) {
6238 if (left)
6239 Py_UNICODE_FILL(u->str, fill, left);
6240 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6241 if (right)
6242 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6243 }
6244
6245 return u;
6246}
6247
/* Append the slice data[left:right] to the list being built.

   Expects the expansion site to provide a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; jumps to onError
   if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that it behaves as a single statement and must
   be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
6259static
6260PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 PyObject *list,
6262 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006264 register Py_ssize_t i;
6265 register Py_ssize_t j;
6266 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006268 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
6270 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006272 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6276 i++;
6277 if (j < i) {
6278 if (maxcount-- <= 0)
6279 break;
6280 SPLIT_APPEND(buf, j, i);
6281 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6282 i++;
6283 j = i;
6284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 }
6286 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 }
6289 return list;
6290
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 Py_DECREF(list);
6293 return NULL;
6294}
6295
6296PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006299 register Py_ssize_t i;
6300 register Py_ssize_t j;
6301 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 PyObject *list;
6303 PyObject *str;
6304 Py_UNICODE *data;
6305
6306 string = PyUnicode_FromObject(string);
6307 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 data = PyUnicode_AS_UNICODE(string);
6310 len = PyUnicode_GET_SIZE(string);
6311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 list = PyList_New(0);
6313 if (!list)
6314 goto onError;
6315
6316 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Find a line and append it */
6320 while (i < len && !BLOOM_LINEBREAK(data[i]))
6321 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 if (i < len) {
6326 if (data[i] == '\r' && i + 1 < len &&
6327 data[i+1] == '\n')
6328 i += 2;
6329 else
6330 i++;
6331 if (keepends)
6332 eol = i;
6333 }
6334 SPLIT_APPEND(data, j, eol);
6335 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 }
6337 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
6340
6341 Py_DECREF(string);
6342 return list;
6343
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006345 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 Py_DECREF(string);
6347 return NULL;
6348}
6349
Tim Petersced69f82003-09-16 20:30:58 +00006350static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 PyObject *list,
6353 Py_UNICODE ch,
6354 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 register Py_ssize_t i;
6357 register Py_ssize_t j;
6358 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006360 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
6362 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 if (buf[i] == ch) {
6364 if (maxcount-- <= 0)
6365 break;
6366 SPLIT_APPEND(buf, j, i);
6367 i = j = i + 1;
6368 } else
6369 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
6374 return list;
6375
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 Py_DECREF(list);
6378 return NULL;
6379}
6380
Tim Petersced69f82003-09-16 20:30:58 +00006381static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 PyObject *list,
6384 PyUnicodeObject *substring,
6385 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006387 register Py_ssize_t i;
6388 register Py_ssize_t j;
6389 Py_ssize_t len = self->length;
6390 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 PyObject *str;
6392
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006393 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 if (Py_UNICODE_MATCH(self, i, substring)) {
6395 if (maxcount-- <= 0)
6396 break;
6397 SPLIT_APPEND(self->str, j, i);
6398 i = j = i + sublen;
6399 } else
6400 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 }
6402 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 }
6405 return list;
6406
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 Py_DECREF(list);
6409 return NULL;
6410}
6411
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006412static
6413PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 PyObject *list,
6415 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006416{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006417 register Py_ssize_t i;
6418 register Py_ssize_t j;
6419 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006420 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006421 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422
6423 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006425 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006427 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6429 i--;
6430 if (j > i) {
6431 if (maxcount-- <= 0)
6432 break;
6433 SPLIT_APPEND(buf, i + 1, j + 1);
6434 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6435 i--;
6436 j = i;
6437 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006438 }
6439 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006441 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006442 if (PyList_Reverse(list) < 0)
6443 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006444 return list;
6445
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006447 Py_DECREF(list);
6448 return NULL;
6449}
6450
Benjamin Peterson14339b62009-01-31 16:36:08 +00006451static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006452PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 PyObject *list,
6454 Py_UNICODE ch,
6455 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 register Py_ssize_t i;
6458 register Py_ssize_t j;
6459 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006460 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006461 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006462
6463 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 if (buf[i] == ch) {
6465 if (maxcount-- <= 0)
6466 break;
6467 SPLIT_APPEND(buf, i + 1, j + 1);
6468 j = i = i - 1;
6469 } else
6470 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006471 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006472 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006474 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006475 if (PyList_Reverse(list) < 0)
6476 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006477 return list;
6478
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006480 Py_DECREF(list);
6481 return NULL;
6482}
6483
Benjamin Peterson14339b62009-01-31 16:36:08 +00006484static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006485PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyObject *list,
6487 PyUnicodeObject *substring,
6488 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006489{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006490 register Py_ssize_t i;
6491 register Py_ssize_t j;
6492 Py_ssize_t len = self->length;
6493 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006494 PyObject *str;
6495
6496 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 if (Py_UNICODE_MATCH(self, i, substring)) {
6498 if (maxcount-- <= 0)
6499 break;
6500 SPLIT_APPEND(self->str, i + sublen, j);
6501 j = i;
6502 i -= sublen;
6503 } else
6504 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006505 }
6506 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006508 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006509 if (PyList_Reverse(list) < 0)
6510 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006511 return list;
6512
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006514 Py_DECREF(list);
6515 return NULL;
6516}
6517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518#undef SPLIT_APPEND
6519
6520static
6521PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 PyUnicodeObject *substring,
6523 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524{
6525 PyObject *list;
6526
6527 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006528 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
6530 list = PyList_New(0);
6531 if (!list)
6532 return NULL;
6533
6534 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
6537 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
6540 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 Py_DECREF(list);
6542 PyErr_SetString(PyExc_ValueError, "empty separator");
6543 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 }
6545 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547}
6548
Tim Petersced69f82003-09-16 20:30:58 +00006549static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006550PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 PyUnicodeObject *substring,
6552 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006553{
6554 PyObject *list;
6555
6556 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006557 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006558
6559 list = PyList_New(0);
6560 if (!list)
6561 return NULL;
6562
6563 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006565
6566 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006568
6569 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 Py_DECREF(list);
6571 PyErr_SetString(PyExc_ValueError, "empty separator");
6572 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006573 }
6574 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006576}
6577
6578static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 PyUnicodeObject *str1,
6581 PyUnicodeObject *str2,
6582 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
6584 PyUnicodeObject *u;
6585
6586 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
Thomas Wouters477c8d52006-05-27 19:21:47 +00006589 if (str1->length == str2->length) {
6590 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006591 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 if (str1->length == 1) {
6593 /* replace characters */
6594 Py_UNICODE u1, u2;
6595 if (!findchar(self->str, self->length, str1->str[0]))
6596 goto nothing;
6597 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6598 if (!u)
6599 return NULL;
6600 Py_UNICODE_COPY(u->str, self->str, self->length);
6601 u1 = str1->str[0];
6602 u2 = str2->str[0];
6603 for (i = 0; i < u->length; i++)
6604 if (u->str[i] == u1) {
6605 if (--maxcount < 0)
6606 break;
6607 u->str[i] = u2;
6608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006610 i = fastsearch(
6611 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006613 if (i < 0)
6614 goto nothing;
6615 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6616 if (!u)
6617 return NULL;
6618 Py_UNICODE_COPY(u->str, self->str, self->length);
6619 while (i <= self->length - str1->length)
6620 if (Py_UNICODE_MATCH(self, i, str1)) {
6621 if (--maxcount < 0)
6622 break;
6623 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6624 i += str1->length;
6625 } else
6626 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006629
6630 Py_ssize_t n, i, j, e;
6631 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 Py_UNICODE *p;
6633
6634 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 if (n > maxcount)
6637 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638 if (n == 0)
6639 goto nothing;
6640 /* new_size = self->length + n * (str2->length - str1->length)); */
6641 delta = (str2->length - str1->length);
6642 if (delta == 0) {
6643 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006645 product = n * (str2->length - str1->length);
6646 if ((product / (str2->length - str1->length)) != n) {
6647 PyErr_SetString(PyExc_OverflowError,
6648 "replace string is too long");
6649 return NULL;
6650 }
6651 new_size = self->length + product;
6652 if (new_size < 0) {
6653 PyErr_SetString(PyExc_OverflowError,
6654 "replace string is too long");
6655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 }
6657 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006658 u = _PyUnicode_New(new_size);
6659 if (!u)
6660 return NULL;
6661 i = 0;
6662 p = u->str;
6663 e = self->length - str1->length;
6664 if (str1->length > 0) {
6665 while (n-- > 0) {
6666 /* look for next match */
6667 j = i;
6668 while (j <= e) {
6669 if (Py_UNICODE_MATCH(self, j, str1))
6670 break;
6671 j++;
6672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006674 if (j > e)
6675 break;
6676 /* copy unchanged part [i:j] */
6677 Py_UNICODE_COPY(p, self->str+i, j-i);
6678 p += j - i;
6679 }
6680 /* copy substitution string */
6681 if (str2->length > 0) {
6682 Py_UNICODE_COPY(p, str2->str, str2->length);
6683 p += str2->length;
6684 }
6685 i = j + str1->length;
6686 }
6687 if (i < self->length)
6688 /* copy tail [i:] */
6689 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6690 } else {
6691 /* interleave */
6692 while (n > 0) {
6693 Py_UNICODE_COPY(p, str2->str, str2->length);
6694 p += str2->length;
6695 if (--n <= 0)
6696 break;
6697 *p++ = self->str[i++];
6698 }
6699 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006703
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705 /* nothing to replace; return original string (when possible) */
6706 if (PyUnicode_CheckExact(self)) {
6707 Py_INCREF(self);
6708 return (PyObject *) self;
6709 }
6710 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
6713/* --- Unicode Object Methods --------------------------------------------- */
6714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006715PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717\n\
6718Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
6721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006722unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 return fixup(self, fixtitle);
6725}
6726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006727PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729\n\
6730Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006734unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 return fixup(self, fixcapitalize);
6737}
6738
#if 0
/* Compiled out: everything up to the matching #endif is disabled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, replace each list slot with its capitalized
   version and join the list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old entry before overwriting its slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6776
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006777/* Argument converter. Coerces to a single unicode character */
6778
6779static int
6780convert_uc(PyObject *obj, void *addr)
6781{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006782 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6783 PyObject *uniobj;
6784 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006785
Benjamin Peterson14339b62009-01-31 16:36:08 +00006786 uniobj = PyUnicode_FromObject(obj);
6787 if (uniobj == NULL) {
6788 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 return 0;
6791 }
6792 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6793 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006795 Py_DECREF(uniobj);
6796 return 0;
6797 }
6798 unistr = PyUnicode_AS_UNICODE(uniobj);
6799 *fillcharloc = unistr[0];
6800 Py_DECREF(uniobj);
6801 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006802}
6803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006807Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006808done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
6810static PyObject *
6811unicode_center(PyUnicodeObject *self, PyObject *args)
6812{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006813 Py_ssize_t marg, left;
6814 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006815 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
Thomas Woutersde017742006-02-16 19:34:37 +00006817 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 return NULL;
6819
Tim Peters7a29bd52001-09-12 03:03:31 +00006820 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 Py_INCREF(self);
6822 return (PyObject*) self;
6823 }
6824
6825 marg = width - self->length;
6826 left = marg / 2 + (marg & width & 1);
6827
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006828 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
6830
Marc-André Lemburge5034372000-08-08 08:04:29 +00006831#if 0
6832
6833/* This code should go into some future Unicode collation support
6834 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006835 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006836
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006837/* speedy UTF-16 code point order comparison */
6838/* gleaned from: */
6839/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6840
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006841static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006842{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006843 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006844 0, 0, 0, 0, 0, 0, 0, 0,
6845 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006846 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006847};
6848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849static int
6850unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006852 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 Py_UNICODE *s1 = str1->str;
6855 Py_UNICODE *s2 = str2->str;
6856
6857 len1 = str1->length;
6858 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006861 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006862
6863 c1 = *s1++;
6864 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006865
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 if (c1 > (1<<11) * 26)
6867 c1 += utf16Fixup[c1>>11];
6868 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006869 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006870 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006871
6872 if (c1 != c2)
6873 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006874
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006875 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 }
6877
6878 return (len1 < len2) ? -1 : (len1 != len2);
6879}
6880
Marc-André Lemburge5034372000-08-08 08:04:29 +00006881#else
6882
6883static int
6884unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006886 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006887
6888 Py_UNICODE *s1 = str1->str;
6889 Py_UNICODE *s2 = str2->str;
6890
6891 len1 = str1->length;
6892 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006893
Marc-André Lemburge5034372000-08-08 08:04:29 +00006894 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006895 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006896
Fredrik Lundh45714e92001-06-26 16:39:36 +00006897 c1 = *s1++;
6898 c2 = *s2++;
6899
6900 if (c1 != c2)
6901 return (c1 < c2) ? -1 : 1;
6902
Marc-André Lemburge5034372000-08-08 08:04:29 +00006903 len1--; len2--;
6904 }
6905
6906 return (len1 < len2) ? -1 : (len1 != len2);
6907}
6908
6909#endif
6910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006914 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6915 return unicode_compare((PyUnicodeObject *)left,
6916 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006917 PyErr_Format(PyExc_TypeError,
6918 "Can't compare %.100s and %.100s",
6919 left->ob_type->tp_name,
6920 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 return -1;
6922}
6923
Martin v. Löwis5b222132007-06-10 09:51:05 +00006924int
6925PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6926{
6927 int i;
6928 Py_UNICODE *id;
6929 assert(PyUnicode_Check(uni));
6930 id = PyUnicode_AS_UNICODE(uni);
6931 /* Compare Unicode string and source character set string */
6932 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 if (id[i] != str[i])
6934 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006935 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006937 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006939 return 0;
6940}
6941
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006942
Benjamin Peterson29060642009-01-31 22:14:21 +00006943#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006944 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006945
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006946PyObject *PyUnicode_RichCompare(PyObject *left,
6947 PyObject *right,
6948 int op)
6949{
6950 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006951
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006952 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6953 PyObject *v;
6954 if (((PyUnicodeObject *) left)->length !=
6955 ((PyUnicodeObject *) right)->length) {
6956 if (op == Py_EQ) {
6957 Py_INCREF(Py_False);
6958 return Py_False;
6959 }
6960 if (op == Py_NE) {
6961 Py_INCREF(Py_True);
6962 return Py_True;
6963 }
6964 }
6965 if (left == right)
6966 result = 0;
6967 else
6968 result = unicode_compare((PyUnicodeObject *)left,
6969 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006970
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006971 /* Convert the return value to a Boolean */
6972 switch (op) {
6973 case Py_EQ:
6974 v = TEST_COND(result == 0);
6975 break;
6976 case Py_NE:
6977 v = TEST_COND(result != 0);
6978 break;
6979 case Py_LE:
6980 v = TEST_COND(result <= 0);
6981 break;
6982 case Py_GE:
6983 v = TEST_COND(result >= 0);
6984 break;
6985 case Py_LT:
6986 v = TEST_COND(result == -1);
6987 break;
6988 case Py_GT:
6989 v = TEST_COND(result == 1);
6990 break;
6991 default:
6992 PyErr_BadArgument();
6993 return NULL;
6994 }
6995 Py_INCREF(v);
6996 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006997 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006998
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006999 Py_INCREF(Py_NotImplemented);
7000 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007001}
7002
Guido van Rossum403d68b2000-03-13 15:55:09 +00007003int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007005{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007007 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007008
7009 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007010 sub = PyUnicode_FromObject(element);
7011 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 PyErr_Format(PyExc_TypeError,
7013 "'in <string>' requires string as left operand, not %s",
7014 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007015 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007016 }
7017
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 str = PyUnicode_FromObject(container);
7019 if (!str) {
7020 Py_DECREF(sub);
7021 return -1;
7022 }
7023
7024 result = stringlib_contains_obj(str, sub);
7025
7026 Py_DECREF(str);
7027 Py_DECREF(sub);
7028
Guido van Rossum403d68b2000-03-13 15:55:09 +00007029 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007030}
7031
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032/* Concat to string or Unicode object giving a new Unicode object. */
7033
7034PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036{
7037 PyUnicodeObject *u = NULL, *v = NULL, *w;
7038
7039 /* Coerce the two arguments */
7040 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7041 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7044 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047 /* Shortcuts */
7048 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 Py_DECREF(v);
7050 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
7052 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 Py_DECREF(u);
7054 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 }
7056
7057 /* Concat the two Unicode strings */
7058 w = _PyUnicode_New(u->length + v->length);
7059 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 Py_UNICODE_COPY(w->str, u->str, u->length);
7062 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7063
7064 Py_DECREF(u);
7065 Py_DECREF(v);
7066 return (PyObject *)w;
7067
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 Py_XDECREF(u);
7070 Py_XDECREF(v);
7071 return NULL;
7072}
7073
Walter Dörwald1ab83302007-05-18 17:15:44 +00007074void
7075PyUnicode_Append(PyObject **pleft, PyObject *right)
7076{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007077 PyObject *new;
7078 if (*pleft == NULL)
7079 return;
7080 if (right == NULL || !PyUnicode_Check(*pleft)) {
7081 Py_DECREF(*pleft);
7082 *pleft = NULL;
7083 return;
7084 }
7085 new = PyUnicode_Concat(*pleft, right);
7086 Py_DECREF(*pleft);
7087 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007088}
7089
7090void
7091PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7092{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 PyUnicode_Append(pleft, right);
7094 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007095}
7096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007100Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007101string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103
7104static PyObject *
7105unicode_count(PyUnicodeObject *self, PyObject *args)
7106{
7107 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007109 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 PyObject *result;
7111
Guido van Rossumb8872e62000-05-09 14:14:27 +00007112 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 return NULL;
7115
7116 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007117 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007120
Thomas Wouters477c8d52006-05-27 19:21:47 +00007121 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122
Christian Heimes217cfd12007-12-02 14:31:20 +00007123 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 stringlib_count(self->str + start, end - start,
7125 substring->str, substring->length)
7126 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127
7128 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007129
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return result;
7131}
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007136Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007137to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007138handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7140'xmlcharrefreplace' as well as any other name registered with\n\
7141codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142
7143static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007144unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007146 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 char *encoding = NULL;
7148 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007149 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007150
Benjamin Peterson308d6372009-09-18 21:42:35 +00007151 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7152 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007154 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007155 if (v == NULL)
7156 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007157 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007158 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007159 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007160 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007161 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007162 Py_DECREF(v);
7163 return NULL;
7164 }
7165 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007166
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007168 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007169}
7170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007171PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173\n\
7174Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007175If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176
7177static PyObject*
7178unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7179{
7180 Py_UNICODE *e;
7181 Py_UNICODE *p;
7182 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007183 Py_UNICODE *qe;
7184 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 PyUnicodeObject *u;
7186 int tabsize = 8;
7187
7188 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
Thomas Wouters7e474022000-07-16 12:04:32 +00007191 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007192 i = 0; /* chars up to and including most recent \n or \r */
7193 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7194 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 for (p = self->str; p < e; p++)
7196 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 if (tabsize > 0) {
7198 incr = tabsize - (j % tabsize); /* cannot overflow */
7199 if (j > PY_SSIZE_T_MAX - incr)
7200 goto overflow1;
7201 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 if (j > PY_SSIZE_T_MAX - 1)
7206 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 j++;
7208 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 if (i > PY_SSIZE_T_MAX - j)
7210 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007212 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 }
7214 }
7215
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007216 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007218
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 /* Second pass: create output string and fill it */
7220 u = _PyUnicode_New(i + j);
7221 if (!u)
7222 return NULL;
7223
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007224 j = 0; /* same as in first pass */
7225 q = u->str; /* next output char */
7226 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228 for (p = self->str; p < e; p++)
7229 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 if (tabsize > 0) {
7231 i = tabsize - (j % tabsize);
7232 j += i;
7233 while (i--) {
7234 if (q >= qe)
7235 goto overflow2;
7236 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007237 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 else {
7241 if (q >= qe)
7242 goto overflow2;
7243 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007244 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 if (*p == '\n' || *p == '\r')
7246 j = 0;
7247 }
7248
7249 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007250
7251 overflow2:
7252 Py_DECREF(u);
7253 overflow1:
7254 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256}
7257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007258PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260\n\
7261Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007262such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263arguments start and end are interpreted as in slice notation.\n\
7264\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266
7267static PyObject *
7268unicode_find(PyUnicodeObject *self, PyObject *args)
7269{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007270 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007271 Py_ssize_t start;
7272 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007273 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
Christian Heimes9cd17752007-11-18 19:35:23 +00007275 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
Thomas Wouters477c8d52006-05-27 19:21:47 +00007278 result = stringlib_find_slice(
7279 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7280 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7281 start, end
7282 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007285
Christian Heimes217cfd12007-12-02 14:31:20 +00007286 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287}
7288
7289static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291{
7292 if (index < 0 || index >= self->length) {
7293 PyErr_SetString(PyExc_IndexError, "string index out of range");
7294 return NULL;
7295 }
7296
7297 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7298}
7299
Guido van Rossumc2504932007-09-18 19:42:40 +00007300/* Believe it or not, this produces the same value for ASCII strings
7301 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007303unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304{
Guido van Rossumc2504932007-09-18 19:42:40 +00007305 Py_ssize_t len;
7306 Py_UNICODE *p;
7307 long x;
7308
7309 if (self->hash != -1)
7310 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007311 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007312 p = self->str;
7313 x = *p << 7;
7314 while (--len >= 0)
7315 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007316 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007317 if (x == -1)
7318 x = -2;
7319 self->hash = x;
7320 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321}
7322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
7328static PyObject *
7329unicode_index(PyUnicodeObject *self, PyObject *args)
7330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007331 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007332 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007333 Py_ssize_t start;
7334 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
Christian Heimes9cd17752007-11-18 19:35:23 +00007336 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 result = stringlib_find_slice(
7340 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7341 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7342 start, end
7343 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344
7345 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 if (result < 0) {
7348 PyErr_SetString(PyExc_ValueError, "substring not found");
7349 return NULL;
7350 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351
Christian Heimes217cfd12007-12-02 14:31:20 +00007352 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353}
7354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007358Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360
7361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007362unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363{
7364 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7365 register const Py_UNICODE *e;
7366 int cased;
7367
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 /* Shortcut for single character strings */
7369 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007372 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007373 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007375
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 e = p + PyUnicode_GET_SIZE(self);
7377 cased = 0;
7378 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007380
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7382 return PyBool_FromLong(0);
7383 else if (!cased && Py_UNICODE_ISLOWER(ch))
7384 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007386 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387}
7388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007389PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007392Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
7395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007396unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397{
7398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7399 register const Py_UNICODE *e;
7400 int cased;
7401
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 /* Shortcut for single character strings */
7403 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007406 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007407 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007409
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 e = p + PyUnicode_GET_SIZE(self);
7411 cased = 0;
7412 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007414
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7416 return PyBool_FromLong(0);
7417 else if (!cased && Py_UNICODE_ISUPPER(ch))
7418 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421}
7422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007423PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007426Return True if S is a titlecased string and there is at least one\n\
7427character in S, i.e. upper- and titlecase characters may only\n\
7428follow uncased characters and lowercase characters only cased ones.\n\
7429Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
7431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007432unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433{
7434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7435 register const Py_UNICODE *e;
7436 int cased, previous_is_cased;
7437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 /* Shortcut for single character strings */
7439 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7441 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007444 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 e = p + PyUnicode_GET_SIZE(self);
7448 cased = 0;
7449 previous_is_cased = 0;
7450 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007452
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7454 if (previous_is_cased)
7455 return PyBool_FromLong(0);
7456 previous_is_cased = 1;
7457 cased = 1;
7458 }
7459 else if (Py_UNICODE_ISLOWER(ch)) {
7460 if (!previous_is_cased)
7461 return PyBool_FromLong(0);
7462 previous_is_cased = 1;
7463 cased = 1;
7464 }
7465 else
7466 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007468 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469}
7470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007471PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007474Return True if all characters in S are whitespace\n\
7475and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007478unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479{
7480 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7481 register const Py_UNICODE *e;
7482
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 /* Shortcut for single character strings */
7484 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 Py_UNICODE_ISSPACE(*p))
7486 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007488 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007489 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007491
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 e = p + PyUnicode_GET_SIZE(self);
7493 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 if (!Py_UNICODE_ISSPACE(*p))
7495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007502\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007503Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007505
7506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007507unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007508{
7509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7510 register const Py_UNICODE *e;
7511
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007512 /* Shortcut for single character strings */
7513 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 Py_UNICODE_ISALPHA(*p))
7515 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007516
7517 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007518 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007520
7521 e = p + PyUnicode_GET_SIZE(self);
7522 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 if (!Py_UNICODE_ISALPHA(*p))
7524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007526 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007527}
7528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007529PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007532Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007534
7535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007536unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007537{
7538 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7539 register const Py_UNICODE *e;
7540
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541 /* Shortcut for single character strings */
7542 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 Py_UNICODE_ISALNUM(*p))
7544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007545
7546 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007547 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549
7550 e = p + PyUnicode_GET_SIZE(self);
7551 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 if (!Py_UNICODE_ISALNUM(*p))
7553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007555 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007556}
7557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007561Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
7564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007565unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
7567 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7568 register const Py_UNICODE *e;
7569
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 /* Shortcut for single character strings */
7571 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 Py_UNICODE_ISDECIMAL(*p))
7573 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007575 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007576 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 e = p + PyUnicode_GET_SIZE(self);
7580 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 if (!Py_UNICODE_ISDECIMAL(*p))
7582 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007584 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585}
7586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007587PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007590Return True if all characters in S are digits\n\
7591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007594unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595{
7596 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7597 register const Py_UNICODE *e;
7598
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 /* Shortcut for single character strings */
7600 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 Py_UNICODE_ISDIGIT(*p))
7602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007604 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007605 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007607
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 e = p + PyUnicode_GET_SIZE(self);
7609 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (!Py_UNICODE_ISDIGIT(*p))
7611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007619Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624{
7625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7626 register const Py_UNICODE *e;
7627
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 /* Shortcut for single character strings */
7629 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 Py_UNICODE_ISNUMERIC(*p))
7631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007634 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 e = p + PyUnicode_GET_SIZE(self);
7638 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 if (!Py_UNICODE_ISNUMERIC(*p))
7640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis47383402007-08-15 07:32:56 +00007645int
7646PyUnicode_IsIdentifier(PyObject *self)
7647{
7648 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7649 register const Py_UNICODE *e;
7650
7651 /* Special case for empty strings */
7652 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007654
7655 /* PEP 3131 says that the first character must be in
7656 XID_Start and subsequent characters in XID_Continue,
7657 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007658 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007659 letters, digits, underscore). However, given the current
7660 definition of XID_Start and XID_Continue, it is sufficient
7661 to check just for these, except that _ must be allowed
7662 as starting an identifier. */
7663 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7664 return 0;
7665
7666 e = p + PyUnicode_GET_SIZE(self);
7667 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 if (!_PyUnicode_IsXidContinue(*p))
7669 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007670 }
7671 return 1;
7672}
7673
7674PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007676\n\
7677Return True if S is a valid identifier according\n\
7678to the language definition.");
7679
7680static PyObject*
7681unicode_isidentifier(PyObject *self)
7682{
7683 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7684}
7685
Georg Brandl559e5d72008-06-11 18:37:52 +00007686PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007688\n\
7689Return True if all characters in S are considered\n\
7690printable in repr() or S is empty, False otherwise.");
7691
7692static PyObject*
7693unicode_isprintable(PyObject *self)
7694{
7695 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7696 register const Py_UNICODE *e;
7697
7698 /* Shortcut for single character strings */
7699 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7700 Py_RETURN_TRUE;
7701 }
7702
7703 e = p + PyUnicode_GET_SIZE(self);
7704 for (; p < e; p++) {
7705 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7706 Py_RETURN_FALSE;
7707 }
7708 }
7709 Py_RETURN_TRUE;
7710}
7711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007713 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714\n\
7715Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007716iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
7718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007719unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
Martin v. Löwis18e16552006-02-15 17:27:45 +00007724static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725unicode_length(PyUnicodeObject *self)
7726{
7727 return self->length;
7728}
7729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007730PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007733Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007734done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
7736static PyObject *
7737unicode_ljust(PyUnicodeObject *self, PyObject *args)
7738{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007739 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007740 Py_UNICODE fillchar = ' ';
7741
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007742 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 return NULL;
7744
Tim Peters7a29bd52001-09-12 03:03:31 +00007745 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 Py_INCREF(self);
7747 return (PyObject*) self;
7748 }
7749
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007750 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751}
7752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007753PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
7758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007759unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 return fixup(self, fixlower);
7762}
7763
/* Selectors telling the strip helpers which end(s) of the string to trim.
   The values double as indices into stripformat[] below, so the two must
   be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is the PyArg_ParseTuple format for one strip method: a single
   optional object argument, followed by the method name after the colon. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the 3-character "|O:" prefix of the
   format string.  Used when building error messages. */
#define STRIPNAME(i) (stripformat[i]+3)
7772
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007773/* externally visible for str.strip(unicode) */
7774PyObject *
7775_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7778 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7779 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7780 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7781 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007782
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007784
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 i = 0;
7786 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7788 i++;
7789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 j = len;
7793 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 do {
7795 j--;
7796 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7797 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007798 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007799
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 Py_INCREF(self);
7802 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007803 }
7804 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806}
7807
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007810do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7813 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 i = 0;
7816 if (striptype != RIGHTSTRIP) {
7817 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7818 i++;
7819 }
7820 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007821
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 j = len;
7823 if (striptype != LEFTSTRIP) {
7824 do {
7825 j--;
7826 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7827 j++;
7828 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007829
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7831 Py_INCREF(self);
7832 return (PyObject*)self;
7833 }
7834 else
7835 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836}
7837
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838
7839static PyObject *
7840do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007842 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7845 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007846
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 if (sep != NULL && sep != Py_None) {
7848 if (PyUnicode_Check(sep))
7849 return _PyUnicode_XStrip(self, striptype, sep);
7850 else {
7851 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 "%s arg must be None or str",
7853 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 return NULL;
7855 }
7856 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007857
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859}
7860
7861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007862PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007864\n\
7865Return a copy of the string S with leading and trailing\n\
7866whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007867If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007868
7869static PyObject *
7870unicode_strip(PyUnicodeObject *self, PyObject *args)
7871{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 if (PyTuple_GET_SIZE(args) == 0)
7873 return do_strip(self, BOTHSTRIP); /* Common case */
7874 else
7875 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876}
7877
7878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007879PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007881\n\
7882Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007883If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884
7885static PyObject *
7886unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7887{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 if (PyTuple_GET_SIZE(args) == 0)
7889 return do_strip(self, LEFTSTRIP); /* Common case */
7890 else
7891 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892}
7893
7894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007895PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007897\n\
7898Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007899If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007900
7901static PyObject *
7902unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7903{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007904 if (PyTuple_GET_SIZE(args) == 0)
7905 return do_strip(self, RIGHTSTRIP); /* Common case */
7906 else
7907 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007908}
7909
7910
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007912unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913{
7914 PyUnicodeObject *u;
7915 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007916 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007917 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918
Georg Brandl222de0f2009-04-12 12:01:50 +00007919 if (len < 1) {
7920 Py_INCREF(unicode_empty);
7921 return (PyObject *)unicode_empty;
7922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923
Tim Peters7a29bd52001-09-12 03:03:31 +00007924 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925 /* no repeat, return original string */
7926 Py_INCREF(str);
7927 return (PyObject*) str;
7928 }
Tim Peters8f422462000-09-09 06:13:41 +00007929
7930 /* ensure # of chars needed doesn't overflow int and # of bytes
7931 * needed doesn't overflow size_t
7932 */
7933 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007934 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007935 PyErr_SetString(PyExc_OverflowError,
7936 "repeated string is too long");
7937 return NULL;
7938 }
7939 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7940 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7941 PyErr_SetString(PyExc_OverflowError,
7942 "repeated string is too long");
7943 return NULL;
7944 }
7945 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 if (!u)
7947 return NULL;
7948
7949 p = u->str;
7950
Georg Brandl222de0f2009-04-12 12:01:50 +00007951 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952 Py_UNICODE_FILL(p, str->str[0], len);
7953 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007954 Py_ssize_t done = str->length; /* number of characters copied this far */
7955 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007957 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007958 Py_UNICODE_COPY(p+done, p, n);
7959 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 }
7962
7963 return (PyObject*) u;
7964}
7965
7966PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 PyObject *subobj,
7968 PyObject *replobj,
7969 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970{
7971 PyObject *self;
7972 PyObject *str1;
7973 PyObject *str2;
7974 PyObject *result;
7975
7976 self = PyUnicode_FromObject(obj);
7977 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 str1 = PyUnicode_FromObject(subobj);
7980 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 Py_DECREF(self);
7982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 }
7984 str2 = PyUnicode_FromObject(replobj);
7985 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 Py_DECREF(self);
7987 Py_DECREF(str1);
7988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 }
Tim Petersced69f82003-09-16 20:30:58 +00007990 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 (PyUnicodeObject *)str1,
7992 (PyUnicodeObject *)str2,
7993 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 Py_DECREF(self);
7995 Py_DECREF(str1);
7996 Py_DECREF(str2);
7997 return result;
7998}
7999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002\n\
8003Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008004old replaced by new. If the optional argument count is\n\
8005given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
8007static PyObject*
8008unicode_replace(PyUnicodeObject *self, PyObject *args)
8009{
8010 PyUnicodeObject *str1;
8011 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 PyObject *result;
8014
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 return NULL;
8017 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8018 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008021 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 Py_DECREF(str1);
8023 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
8026 result = replace(self, str1, str2, maxcount);
8027
8028 Py_DECREF(str1);
8029 Py_DECREF(str2);
8030 return result;
8031}
8032
8033static
8034PyObject *unicode_repr(PyObject *unicode)
8035{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008036 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008037 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008038 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8039 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8040
8041 /* XXX(nnorwitz): rather than over-allocating, it would be
8042 better to choose a different scheme. Perhaps scan the
8043 first N-chars of the string and allocate based on that size.
8044 */
8045 /* Initial allocation is based on the longest-possible unichr
8046 escape.
8047
8048 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8049 unichr, so in this case it's the longest unichr escape. In
8050 narrow (UTF-16) builds this is five chars per source unichr
8051 since there are two unichrs in the surrogate pair, so in narrow
8052 (UTF-16) builds it's not the longest unichr escape.
8053
8054 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8055 so in the narrow (UTF-16) build case it's the longest unichr
8056 escape.
8057 */
8058
Walter Dörwald1ab83302007-05-18 17:15:44 +00008059 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008061#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008063#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008065#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067 if (repr == NULL)
8068 return NULL;
8069
Walter Dörwald1ab83302007-05-18 17:15:44 +00008070 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008071
8072 /* Add quote */
8073 *p++ = (findchar(s, size, '\'') &&
8074 !findchar(s, size, '"')) ? '"' : '\'';
8075 while (size-- > 0) {
8076 Py_UNICODE ch = *s++;
8077
8078 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008079 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008080 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008081 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008082 continue;
8083 }
8084
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008086 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008087 *p++ = '\\';
8088 *p++ = 't';
8089 }
8090 else if (ch == '\n') {
8091 *p++ = '\\';
8092 *p++ = 'n';
8093 }
8094 else if (ch == '\r') {
8095 *p++ = '\\';
8096 *p++ = 'r';
8097 }
8098
8099 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008100 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008101 *p++ = '\\';
8102 *p++ = 'x';
8103 *p++ = hexdigits[(ch >> 4) & 0x000F];
8104 *p++ = hexdigits[ch & 0x000F];
8105 }
8106
Georg Brandl559e5d72008-06-11 18:37:52 +00008107 /* Copy ASCII characters as-is */
8108 else if (ch < 0x7F) {
8109 *p++ = ch;
8110 }
8111
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008113 else {
8114 Py_UCS4 ucs = ch;
8115
8116#ifndef Py_UNICODE_WIDE
8117 Py_UNICODE ch2 = 0;
8118 /* Get code point from surrogate pair */
8119 if (size > 0) {
8120 ch2 = *s;
8121 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008126 size--;
8127 }
8128 }
8129#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008131 (categories Z* and C* except ASCII space)
8132 */
8133 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8134 /* Map 8-bit characters to '\xhh' */
8135 if (ucs <= 0xff) {
8136 *p++ = '\\';
8137 *p++ = 'x';
8138 *p++ = hexdigits[(ch >> 4) & 0x000F];
8139 *p++ = hexdigits[ch & 0x000F];
8140 }
8141 /* Map 21-bit characters to '\U00xxxxxx' */
8142 else if (ucs >= 0x10000) {
8143 *p++ = '\\';
8144 *p++ = 'U';
8145 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8146 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8147 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8148 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8149 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8150 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8151 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8152 *p++ = hexdigits[ucs & 0x0000000F];
8153 }
8154 /* Map 16-bit characters to '\uxxxx' */
8155 else {
8156 *p++ = '\\';
8157 *p++ = 'u';
8158 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8159 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8160 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8161 *p++ = hexdigits[ucs & 0x000F];
8162 }
8163 }
8164 /* Copy characters as-is */
8165 else {
8166 *p++ = ch;
8167#ifndef Py_UNICODE_WIDE
8168 if (ucs >= 0x10000)
8169 *p++ = ch2;
8170#endif
8171 }
8172 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008173 }
8174 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008175 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008176
8177 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008178 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008179 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180}
8181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008182PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184\n\
8185Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008186such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187arguments start and end are interpreted as in slice notation.\n\
8188\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008189Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190
8191static PyObject *
8192unicode_rfind(PyUnicodeObject *self, PyObject *args)
8193{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008195 Py_ssize_t start;
8196 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198
Christian Heimes9cd17752007-11-18 19:35:23 +00008199 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201
Thomas Wouters477c8d52006-05-27 19:21:47 +00008202 result = stringlib_rfind_slice(
8203 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8204 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8205 start, end
8206 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
8208 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209
Christian Heimes217cfd12007-12-02 14:31:20 +00008210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211}
8212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008213PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008216Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217
8218static PyObject *
8219unicode_rindex(PyUnicodeObject *self, PyObject *args)
8220{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008221 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008222 Py_ssize_t start;
8223 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008224 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
Christian Heimes9cd17752007-11-18 19:35:23 +00008226 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 result = stringlib_rfind_slice(
8230 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8231 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8232 start, end
8233 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234
8235 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 if (result < 0) {
8238 PyErr_SetString(PyExc_ValueError, "substring not found");
8239 return NULL;
8240 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008241 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242}
8243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008244PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008247Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008248done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
8250static PyObject *
8251unicode_rjust(PyUnicodeObject *self, PyObject *args)
8252{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008253 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008254 Py_UNICODE fillchar = ' ';
8255
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008256 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 return NULL;
8258
Tim Peters7a29bd52001-09-12 03:03:31 +00008259 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 Py_INCREF(self);
8261 return (PyObject*) self;
8262 }
8263
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008264 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265}
8266
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 PyObject *sep,
8269 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
8271 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008272
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 s = PyUnicode_FromObject(s);
8274 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 if (sep != NULL) {
8277 sep = PyUnicode_FromObject(sep);
8278 if (sep == NULL) {
8279 Py_DECREF(s);
8280 return NULL;
8281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
8283
8284 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8285
8286 Py_DECREF(s);
8287 Py_XDECREF(sep);
8288 return result;
8289}
8290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008291PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293\n\
8294Return a list of the words in S, using sep as the\n\
8295delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008296splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008297whitespace string is a separator and empty strings are\n\
8298removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299
8300static PyObject*
8301unicode_split(PyUnicodeObject *self, PyObject *args)
8302{
8303 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
Martin v. Löwis18e16552006-02-15 17:27:45 +00008306 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return NULL;
8308
8309 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315}
8316
Thomas Wouters477c8d52006-05-27 19:21:47 +00008317PyObject *
8318PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8319{
8320 PyObject* str_obj;
8321 PyObject* sep_obj;
8322 PyObject* out;
8323
8324 str_obj = PyUnicode_FromObject(str_in);
8325 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008327 sep_obj = PyUnicode_FromObject(sep_in);
8328 if (!sep_obj) {
8329 Py_DECREF(str_obj);
8330 return NULL;
8331 }
8332
8333 out = stringlib_partition(
8334 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8335 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8336 );
8337
8338 Py_DECREF(sep_obj);
8339 Py_DECREF(str_obj);
8340
8341 return out;
8342}
8343
8344
8345PyObject *
8346PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8347{
8348 PyObject* str_obj;
8349 PyObject* sep_obj;
8350 PyObject* out;
8351
8352 str_obj = PyUnicode_FromObject(str_in);
8353 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355 sep_obj = PyUnicode_FromObject(sep_in);
8356 if (!sep_obj) {
8357 Py_DECREF(str_obj);
8358 return NULL;
8359 }
8360
8361 out = stringlib_rpartition(
8362 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8363 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8364 );
8365
8366 Py_DECREF(sep_obj);
8367 Py_DECREF(str_obj);
8368
8369 return out;
8370}
8371
8372PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008375Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008376the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008377found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378
8379static PyObject*
8380unicode_partition(PyUnicodeObject *self, PyObject *separator)
8381{
8382 return PyUnicode_Partition((PyObject *)self, separator);
8383}
8384
8385PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008388Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008390separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008391
8392static PyObject*
8393unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8394{
8395 return PyUnicode_RPartition((PyObject *)self, separator);
8396}
8397
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008398PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 PyObject *sep,
8400 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008401{
8402 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008404 s = PyUnicode_FromObject(s);
8405 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 if (sep != NULL) {
8408 sep = PyUnicode_FromObject(sep);
8409 if (sep == NULL) {
8410 Py_DECREF(s);
8411 return NULL;
8412 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008413 }
8414
8415 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8416
8417 Py_DECREF(s);
8418 Py_XDECREF(sep);
8419 return result;
8420}
8421
8422PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008424\n\
8425Return a list of the words in S, using sep as the\n\
8426delimiter string, starting at the end of the string and\n\
8427working to the front. If maxsplit is given, at most maxsplit\n\
8428splits are done. If sep is not specified, any whitespace string\n\
8429is a separator.");
8430
8431static PyObject*
8432unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8433{
8434 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008436
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008438 return NULL;
8439
8440 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008442 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008444 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008446}
8447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008448PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450\n\
8451Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008452Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008453is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454
8455static PyObject*
8456unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8457{
Guido van Rossum86662912000-04-11 15:38:46 +00008458 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459
Guido van Rossum86662912000-04-11 15:38:46 +00008460 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 return NULL;
8462
Guido van Rossum86662912000-04-11 15:38:46 +00008463 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464}
8465
8466static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008467PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468{
Walter Dörwald346737f2007-05-31 10:44:43 +00008469 if (PyUnicode_CheckExact(self)) {
8470 Py_INCREF(self);
8471 return self;
8472 } else
8473 /* Subtype -- return genuine unicode string with the same value. */
8474 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8475 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476}
8477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008478PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480\n\
8481Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008482and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483
8484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008485unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 return fixup(self, fixswapcase);
8488}
8489
Georg Brandlceee0772007-11-27 23:48:05 +00008490PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008492\n\
8493Return a translation table usable for str.translate().\n\
8494If there is only one argument, it must be a dictionary mapping Unicode\n\
8495ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008496Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008497If there are two arguments, they must be strings of equal length, and\n\
8498in the resulting dictionary, each character in x will be mapped to the\n\
8499character at the same position in y. If there is a third argument, it\n\
8500must be a string, whose characters will be mapped to None in the result.");
8501
8502static PyObject*
8503unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8504{
8505 PyObject *x, *y = NULL, *z = NULL;
8506 PyObject *new = NULL, *key, *value;
8507 Py_ssize_t i = 0;
8508 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509
Georg Brandlceee0772007-11-27 23:48:05 +00008510 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8511 return NULL;
8512 new = PyDict_New();
8513 if (!new)
8514 return NULL;
8515 if (y != NULL) {
8516 /* x must be a string too, of equal length */
8517 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8518 if (!PyUnicode_Check(x)) {
8519 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8520 "be a string if there is a second argument");
8521 goto err;
8522 }
8523 if (PyUnicode_GET_SIZE(x) != ylen) {
8524 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8525 "arguments must have equal length");
8526 goto err;
8527 }
8528 /* create entries for translating chars in x to those in y */
8529 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008530 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8531 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008532 if (!key || !value)
8533 goto err;
8534 res = PyDict_SetItem(new, key, value);
8535 Py_DECREF(key);
8536 Py_DECREF(value);
8537 if (res < 0)
8538 goto err;
8539 }
8540 /* create entries for deleting chars in z */
8541 if (z != NULL) {
8542 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008543 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008544 if (!key)
8545 goto err;
8546 res = PyDict_SetItem(new, key, Py_None);
8547 Py_DECREF(key);
8548 if (res < 0)
8549 goto err;
8550 }
8551 }
8552 } else {
8553 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008554 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008555 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8556 "to maketrans it must be a dict");
8557 goto err;
8558 }
8559 /* copy entries into the new dict, converting string keys to int keys */
8560 while (PyDict_Next(x, &i, &key, &value)) {
8561 if (PyUnicode_Check(key)) {
8562 /* convert string keys to integer keys */
8563 PyObject *newkey;
8564 if (PyUnicode_GET_SIZE(key) != 1) {
8565 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8566 "table must be of length 1");
8567 goto err;
8568 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008569 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008570 if (!newkey)
8571 goto err;
8572 res = PyDict_SetItem(new, newkey, value);
8573 Py_DECREF(newkey);
8574 if (res < 0)
8575 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008576 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008577 /* just keep integer keys */
8578 if (PyDict_SetItem(new, key, value) < 0)
8579 goto err;
8580 } else {
8581 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8582 "be strings or integers");
8583 goto err;
8584 }
8585 }
8586 }
8587 return new;
8588 err:
8589 Py_DECREF(new);
8590 return NULL;
8591}
8592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008593PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595\n\
8596Return a copy of the string S, where all characters have been mapped\n\
8597through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008598Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008599Unmapped characters are left untouched. Characters mapped to None\n\
8600are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
8602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008603unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604{
Georg Brandlceee0772007-11-27 23:48:05 +00008605 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008608PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008611Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
8613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008614unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 return fixup(self, fixupper);
8617}
8618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008619PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008622Pad a numeric string S with zeros on the left, to fill a field\n\
8623of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624
8625static PyObject *
8626unicode_zfill(PyUnicodeObject *self, PyObject *args)
8627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008628 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 PyUnicodeObject *u;
8630
Martin v. Löwis18e16552006-02-15 17:27:45 +00008631 Py_ssize_t width;
8632 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 return NULL;
8634
8635 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008636 if (PyUnicode_CheckExact(self)) {
8637 Py_INCREF(self);
8638 return (PyObject*) self;
8639 }
8640 else
8641 return PyUnicode_FromUnicode(
8642 PyUnicode_AS_UNICODE(self),
8643 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 }
8646
8647 fill = width - self->length;
8648
8649 u = pad(self, fill, 0, '0');
8650
Walter Dörwald068325e2002-04-15 13:36:47 +00008651 if (u == NULL)
8652 return NULL;
8653
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 if (u->str[fill] == '+' || u->str[fill] == '-') {
8655 /* move sign to beginning of string */
8656 u->str[0] = u->str[fill];
8657 u->str[fill] = '0';
8658 }
8659
8660 return (PyObject*) u;
8661}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662
#if 0
/* Compiled out.  Returns the current value of the file-level counter
   `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008671PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008674Return True if S starts with the specified prefix, False otherwise.\n\
8675With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008676With optional end, stop comparing S at that position.\n\
8677prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678
8679static PyObject *
8680unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008685 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008686 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008687 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008689 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8691 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008692 if (PyTuple_Check(subobj)) {
8693 Py_ssize_t i;
8694 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8695 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008697 if (substring == NULL)
8698 return NULL;
8699 result = tailmatch(self, substring, start, end, -1);
8700 Py_DECREF(substring);
8701 if (result) {
8702 Py_RETURN_TRUE;
8703 }
8704 }
8705 /* nothing matched */
8706 Py_RETURN_FALSE;
8707 }
8708 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008711 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008713 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714}
8715
8716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008717PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008720Return True if S ends with the specified suffix, False otherwise.\n\
8721With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008722With optional end, stop comparing S at that position.\n\
8723suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
8725static PyObject *
8726unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008729 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008731 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008732 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008733 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008735 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8737 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 if (PyTuple_Check(subobj)) {
8739 Py_ssize_t i;
8740 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8741 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008743 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008745 result = tailmatch(self, substring, start, end, +1);
8746 Py_DECREF(substring);
8747 if (result) {
8748 Py_RETURN_TRUE;
8749 }
8750 }
8751 Py_RETURN_FALSE;
8752 }
8753 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008757 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008759 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760}
8761
Eric Smith8c663262007-08-25 02:26:07 +00008762#include "stringlib/string_format.h"
8763
/* Docstring for the "format" method.  The unicode_methods table pairs it
   with do_string_format.  Only the signature line is filled in; the
   descriptive part of the docstring is empty. */
8764PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008766\n\
8767");
8768
Eric Smith4a7d76d2008-05-30 18:10:19 +00008769static PyObject *
8770unicode__format__(PyObject* self, PyObject* args)
8771{
8772 PyObject *format_spec;
8773
8774 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8775 return NULL;
8776
8777 return _PyUnicode_FormatAdvanced(self,
8778 PyUnicode_AS_UNICODE(format_spec),
8779 PyUnicode_GET_SIZE(format_spec));
8780}
8781
Eric Smith8c663262007-08-25 02:26:07 +00008782PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008784\n\
8785");
8786
8787static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008788unicode__sizeof__(PyUnicodeObject *v)
8789{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008790 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8791 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008792}
8793
8794PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008796
8797static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008798unicode_getnewargs(PyUnicodeObject *v)
8799{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008800 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008801}
8802
8803
/* Method table for the str type.  Each entry gives the Python-level name,
   the C function, the calling-convention flags and (optionally) the
   docstring.  Entries that omit the docstring leave that field
   zero-initialized.  "maketrans" is the only entry flagged METH_STATIC. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804static PyMethodDef unicode_methods[] = {
8805
8806 /* Order is according to common usage: often used methods should
8807 appear first, since lookup is done sequentially. */
8808
Benjamin Peterson308d6372009-09-18 21:42:35 +00008809 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008810 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8811 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008812 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008813 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8814 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8815 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8816 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8817 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8818 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8819 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008820 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008821 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8822 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8823 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008824 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008825 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8826 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8827 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008828 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008829 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008830 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008831 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008832 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8833 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8834 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8835 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8836 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8837 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8838 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8839 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8840 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8841 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8842 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8843 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8844 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8845 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008846 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008847 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008848 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008849 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008850 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008851 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8852 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008853 {"maketrans", (PyCFunction) unicode_maketrans,
8854 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008855 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
/* "capwords" is compiled out by the #if 0 below. */
Walter Dörwald068325e2002-04-15 13:36:47 +00008856#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008857 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858#endif
8859
8860#if 0
8861 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008862 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863#endif
8864
Benjamin Peterson14339b62009-01-31 16:36:08 +00008865 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
/* All-NULL sentinel entry marking the end of the table. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866 {NULL, NULL}
8867};
8868
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008869static PyObject *
8870unicode_mod(PyObject *v, PyObject *w)
8871{
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 if (!PyUnicode_Check(v)) {
8873 Py_INCREF(Py_NotImplemented);
8874 return Py_NotImplemented;
8875 }
8876 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008877}
8878
/* Number protocol table for str.  Only nb_remainder is filled in (with
   unicode_mod, the `%` formatting operator); the first three slots are
   explicitly 0 and every slot after nb_remainder is left zero-initialized. */
8879static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008880 0, /*nb_add*/
8881 0, /*nb_subtract*/
8882 0, /*nb_multiply*/
8883 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008884};
8885
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 (lenfunc) unicode_length, /* sq_length */
8888 PyUnicode_Concat, /* sq_concat */
8889 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8890 (ssizeargfunc) unicode_getitem, /* sq_item */
8891 0, /* sq_slice */
8892 0, /* sq_ass_item */
8893 0, /* sq_ass_slice */
8894 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895};
8896
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008897static PyObject*
8898unicode_subscript(PyUnicodeObject* self, PyObject* item)
8899{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008900 if (PyIndex_Check(item)) {
8901 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008902 if (i == -1 && PyErr_Occurred())
8903 return NULL;
8904 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008905 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008906 return unicode_getitem(self, i);
8907 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008908 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008909 Py_UNICODE* source_buf;
8910 Py_UNICODE* result_buf;
8911 PyObject* result;
8912
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008913 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008915 return NULL;
8916 }
8917
8918 if (slicelength <= 0) {
8919 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008920 } else if (start == 0 && step == 1 && slicelength == self->length &&
8921 PyUnicode_CheckExact(self)) {
8922 Py_INCREF(self);
8923 return (PyObject *)self;
8924 } else if (step == 1) {
8925 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008926 } else {
8927 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008928 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8929 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008930
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 if (result_buf == NULL)
8932 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008933
8934 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8935 result_buf[i] = source_buf[cur];
8936 }
Tim Petersced69f82003-09-16 20:30:58 +00008937
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008938 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008939 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008940 return result;
8941 }
8942 } else {
8943 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8944 return NULL;
8945 }
8946}
8947
/* Mapping protocol table for str: length and subscripting (via
   unicode_subscript) are provided; the assignment slot is 0. */
8948static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008949 (lenfunc)unicode_length, /* mp_length */
8950 (binaryfunc)unicode_subscript, /* mp_subscript */
8951 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008952};
8953
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955/* Helpers for PyUnicode_Format() */
8956
8957static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008958getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008960 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 (*p_argidx)++;
8963 if (arglen < 0)
8964 return args;
8965 else
8966 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 }
8968 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 return NULL;
8971}
8972
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008973/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008975static PyObject *
8976formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008978 char *p;
8979 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008981
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 x = PyFloat_AsDouble(v);
8983 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008984 return NULL;
8985
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008988
Eric Smith0923d1d2009-04-16 20:16:10 +00008989 p = PyOS_double_to_string(x, type, prec,
8990 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008991 if (p == NULL)
8992 return NULL;
8993 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008994 PyMem_Free(p);
8995 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996}
8997
Tim Peters38fd5b62000-09-21 05:43:11 +00008998static PyObject*
8999formatlong(PyObject *val, int flags, int prec, int type)
9000{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009001 char *buf;
9002 int len;
9003 PyObject *str; /* temporary string object. */
9004 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009005
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9007 if (!str)
9008 return NULL;
9009 result = PyUnicode_FromStringAndSize(buf, len);
9010 Py_DECREF(str);
9011 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009012}
9013
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014static int
9015formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009016 size_t buflen,
9017 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009019 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009020 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 if (PyUnicode_GET_SIZE(v) == 1) {
9022 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9023 buf[1] = '\0';
9024 return 1;
9025 }
9026#ifndef Py_UNICODE_WIDE
9027 if (PyUnicode_GET_SIZE(v) == 2) {
9028 /* Decode a valid surrogate pair */
9029 int c0 = PyUnicode_AS_UNICODE(v)[0];
9030 int c1 = PyUnicode_AS_UNICODE(v)[1];
9031 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9032 0xDC00 <= c1 && c1 <= 0xDFFF) {
9033 buf[0] = c0;
9034 buf[1] = c1;
9035 buf[2] = '\0';
9036 return 2;
9037 }
9038 }
9039#endif
9040 goto onError;
9041 }
9042 else {
9043 /* Integer input truncated to a character */
9044 long x;
9045 x = PyLong_AsLong(v);
9046 if (x == -1 && PyErr_Occurred())
9047 goto onError;
9048
9049 if (x < 0 || x > 0x10ffff) {
9050 PyErr_SetString(PyExc_OverflowError,
9051 "%c arg not in range(0x110000)");
9052 return -1;
9053 }
9054
9055#ifndef Py_UNICODE_WIDE
9056 if (x > 0xffff) {
9057 x -= 0x10000;
9058 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9059 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9060 return 2;
9061 }
9062#endif
9063 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009064 buf[1] = '\0';
9065 return 1;
9066 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009067
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009069 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009071 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009074/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009075 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009076*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009077#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009078
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
9082 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009083 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 int args_owned = 0;
9085 PyUnicodeObject *result = NULL;
9086 PyObject *dict = NULL;
9087 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009088
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 PyErr_BadInternalCall();
9091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 }
9093 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009094 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 fmt = PyUnicode_AS_UNICODE(uformat);
9097 fmtcnt = PyUnicode_GET_SIZE(uformat);
9098
9099 reslen = rescnt = fmtcnt + 100;
9100 result = _PyUnicode_New(reslen);
9101 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 res = PyUnicode_AS_UNICODE(result);
9104
9105 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 arglen = PyTuple_Size(args);
9107 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
9109 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 arglen = -1;
9111 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009113 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009114 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116
9117 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 if (*fmt != '%') {
9119 if (--rescnt < 0) {
9120 rescnt = fmtcnt + 100;
9121 reslen += rescnt;
9122 if (_PyUnicode_Resize(&result, reslen) < 0)
9123 goto onError;
9124 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9125 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009126 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009128 }
9129 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 /* Got a format specifier */
9131 int flags = 0;
9132 Py_ssize_t width = -1;
9133 int prec = -1;
9134 Py_UNICODE c = '\0';
9135 Py_UNICODE fill;
9136 int isnumok;
9137 PyObject *v = NULL;
9138 PyObject *temp = NULL;
9139 Py_UNICODE *pbuf;
9140 Py_UNICODE sign;
9141 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009142 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 fmt++;
9145 if (*fmt == '(') {
9146 Py_UNICODE *keystart;
9147 Py_ssize_t keylen;
9148 PyObject *key;
9149 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009150
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 if (dict == NULL) {
9152 PyErr_SetString(PyExc_TypeError,
9153 "format requires a mapping");
9154 goto onError;
9155 }
9156 ++fmt;
9157 --fmtcnt;
9158 keystart = fmt;
9159 /* Skip over balanced parentheses */
9160 while (pcount > 0 && --fmtcnt >= 0) {
9161 if (*fmt == ')')
9162 --pcount;
9163 else if (*fmt == '(')
9164 ++pcount;
9165 fmt++;
9166 }
9167 keylen = fmt - keystart - 1;
9168 if (fmtcnt < 0 || pcount > 0) {
9169 PyErr_SetString(PyExc_ValueError,
9170 "incomplete format key");
9171 goto onError;
9172 }
9173#if 0
9174 /* keys are converted to strings using UTF-8 and
9175 then looked up since Python uses strings to hold
9176 variables names etc. in its namespaces and we
9177 wouldn't want to break common idioms. */
9178 key = PyUnicode_EncodeUTF8(keystart,
9179 keylen,
9180 NULL);
9181#else
9182 key = PyUnicode_FromUnicode(keystart, keylen);
9183#endif
9184 if (key == NULL)
9185 goto onError;
9186 if (args_owned) {
9187 Py_DECREF(args);
9188 args_owned = 0;
9189 }
9190 args = PyObject_GetItem(dict, key);
9191 Py_DECREF(key);
9192 if (args == NULL) {
9193 goto onError;
9194 }
9195 args_owned = 1;
9196 arglen = -1;
9197 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 while (--fmtcnt >= 0) {
9200 switch (c = *fmt++) {
9201 case '-': flags |= F_LJUST; continue;
9202 case '+': flags |= F_SIGN; continue;
9203 case ' ': flags |= F_BLANK; continue;
9204 case '#': flags |= F_ALT; continue;
9205 case '0': flags |= F_ZERO; continue;
9206 }
9207 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009209 if (c == '*') {
9210 v = getnextarg(args, arglen, &argidx);
9211 if (v == NULL)
9212 goto onError;
9213 if (!PyLong_Check(v)) {
9214 PyErr_SetString(PyExc_TypeError,
9215 "* wants int");
9216 goto onError;
9217 }
9218 width = PyLong_AsLong(v);
9219 if (width == -1 && PyErr_Occurred())
9220 goto onError;
9221 if (width < 0) {
9222 flags |= F_LJUST;
9223 width = -width;
9224 }
9225 if (--fmtcnt >= 0)
9226 c = *fmt++;
9227 }
9228 else if (c >= '0' && c <= '9') {
9229 width = c - '0';
9230 while (--fmtcnt >= 0) {
9231 c = *fmt++;
9232 if (c < '0' || c > '9')
9233 break;
9234 if ((width*10) / 10 != width) {
9235 PyErr_SetString(PyExc_ValueError,
9236 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009237 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 }
9239 width = width*10 + (c - '0');
9240 }
9241 }
9242 if (c == '.') {
9243 prec = 0;
9244 if (--fmtcnt >= 0)
9245 c = *fmt++;
9246 if (c == '*') {
9247 v = getnextarg(args, arglen, &argidx);
9248 if (v == NULL)
9249 goto onError;
9250 if (!PyLong_Check(v)) {
9251 PyErr_SetString(PyExc_TypeError,
9252 "* wants int");
9253 goto onError;
9254 }
9255 prec = PyLong_AsLong(v);
9256 if (prec == -1 && PyErr_Occurred())
9257 goto onError;
9258 if (prec < 0)
9259 prec = 0;
9260 if (--fmtcnt >= 0)
9261 c = *fmt++;
9262 }
9263 else if (c >= '0' && c <= '9') {
9264 prec = c - '0';
9265 while (--fmtcnt >= 0) {
9266 c = Py_CHARMASK(*fmt++);
9267 if (c < '0' || c > '9')
9268 break;
9269 if ((prec*10) / 10 != prec) {
9270 PyErr_SetString(PyExc_ValueError,
9271 "prec too big");
9272 goto onError;
9273 }
9274 prec = prec*10 + (c - '0');
9275 }
9276 }
9277 } /* prec */
9278 if (fmtcnt >= 0) {
9279 if (c == 'h' || c == 'l' || c == 'L') {
9280 if (--fmtcnt >= 0)
9281 c = *fmt++;
9282 }
9283 }
9284 if (fmtcnt < 0) {
9285 PyErr_SetString(PyExc_ValueError,
9286 "incomplete format");
9287 goto onError;
9288 }
9289 if (c != '%') {
9290 v = getnextarg(args, arglen, &argidx);
9291 if (v == NULL)
9292 goto onError;
9293 }
9294 sign = 0;
9295 fill = ' ';
9296 switch (c) {
9297
9298 case '%':
9299 pbuf = formatbuf;
9300 /* presume that buffer length is at least 1 */
9301 pbuf[0] = '%';
9302 len = 1;
9303 break;
9304
9305 case 's':
9306 case 'r':
9307 case 'a':
9308 if (PyUnicode_Check(v) && c == 's') {
9309 temp = v;
9310 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009311 }
9312 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009313 if (c == 's')
9314 temp = PyObject_Str(v);
9315 else if (c == 'r')
9316 temp = PyObject_Repr(v);
9317 else
9318 temp = PyObject_ASCII(v);
9319 if (temp == NULL)
9320 goto onError;
9321 if (PyUnicode_Check(temp))
9322 /* nothing to do */;
9323 else {
9324 Py_DECREF(temp);
9325 PyErr_SetString(PyExc_TypeError,
9326 "%s argument has non-string str()");
9327 goto onError;
9328 }
9329 }
9330 pbuf = PyUnicode_AS_UNICODE(temp);
9331 len = PyUnicode_GET_SIZE(temp);
9332 if (prec >= 0 && len > prec)
9333 len = prec;
9334 break;
9335
9336 case 'i':
9337 case 'd':
9338 case 'u':
9339 case 'o':
9340 case 'x':
9341 case 'X':
9342 if (c == 'i')
9343 c = 'd';
9344 isnumok = 0;
9345 if (PyNumber_Check(v)) {
9346 PyObject *iobj=NULL;
9347
9348 if (PyLong_Check(v)) {
9349 iobj = v;
9350 Py_INCREF(iobj);
9351 }
9352 else {
9353 iobj = PyNumber_Long(v);
9354 }
9355 if (iobj!=NULL) {
9356 if (PyLong_Check(iobj)) {
9357 isnumok = 1;
9358 temp = formatlong(iobj, flags, prec, c);
9359 Py_DECREF(iobj);
9360 if (!temp)
9361 goto onError;
9362 pbuf = PyUnicode_AS_UNICODE(temp);
9363 len = PyUnicode_GET_SIZE(temp);
9364 sign = 1;
9365 }
9366 else {
9367 Py_DECREF(iobj);
9368 }
9369 }
9370 }
9371 if (!isnumok) {
9372 PyErr_Format(PyExc_TypeError,
9373 "%%%c format: a number is required, "
9374 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9375 goto onError;
9376 }
9377 if (flags & F_ZERO)
9378 fill = '0';
9379 break;
9380
9381 case 'e':
9382 case 'E':
9383 case 'f':
9384 case 'F':
9385 case 'g':
9386 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009387 temp = formatfloat(v, flags, prec, c);
9388 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009390 pbuf = PyUnicode_AS_UNICODE(temp);
9391 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 sign = 1;
9393 if (flags & F_ZERO)
9394 fill = '0';
9395 break;
9396
9397 case 'c':
9398 pbuf = formatbuf;
9399 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9400 if (len < 0)
9401 goto onError;
9402 break;
9403
9404 default:
9405 PyErr_Format(PyExc_ValueError,
9406 "unsupported format character '%c' (0x%x) "
9407 "at index %zd",
9408 (31<=c && c<=126) ? (char)c : '?',
9409 (int)c,
9410 (Py_ssize_t)(fmt - 1 -
9411 PyUnicode_AS_UNICODE(uformat)));
9412 goto onError;
9413 }
9414 if (sign) {
9415 if (*pbuf == '-' || *pbuf == '+') {
9416 sign = *pbuf++;
9417 len--;
9418 }
9419 else if (flags & F_SIGN)
9420 sign = '+';
9421 else if (flags & F_BLANK)
9422 sign = ' ';
9423 else
9424 sign = 0;
9425 }
9426 if (width < len)
9427 width = len;
9428 if (rescnt - (sign != 0) < width) {
9429 reslen -= rescnt;
9430 rescnt = width + fmtcnt + 100;
9431 reslen += rescnt;
9432 if (reslen < 0) {
9433 Py_XDECREF(temp);
9434 PyErr_NoMemory();
9435 goto onError;
9436 }
9437 if (_PyUnicode_Resize(&result, reslen) < 0) {
9438 Py_XDECREF(temp);
9439 goto onError;
9440 }
9441 res = PyUnicode_AS_UNICODE(result)
9442 + reslen - rescnt;
9443 }
9444 if (sign) {
9445 if (fill != ' ')
9446 *res++ = sign;
9447 rescnt--;
9448 if (width > len)
9449 width--;
9450 }
9451 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9452 assert(pbuf[0] == '0');
9453 assert(pbuf[1] == c);
9454 if (fill != ' ') {
9455 *res++ = *pbuf++;
9456 *res++ = *pbuf++;
9457 }
9458 rescnt -= 2;
9459 width -= 2;
9460 if (width < 0)
9461 width = 0;
9462 len -= 2;
9463 }
9464 if (width > len && !(flags & F_LJUST)) {
9465 do {
9466 --rescnt;
9467 *res++ = fill;
9468 } while (--width > len);
9469 }
9470 if (fill == ' ') {
9471 if (sign)
9472 *res++ = sign;
9473 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9474 assert(pbuf[0] == '0');
9475 assert(pbuf[1] == c);
9476 *res++ = *pbuf++;
9477 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009478 }
9479 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 Py_UNICODE_COPY(res, pbuf, len);
9481 res += len;
9482 rescnt -= len;
9483 while (--width >= len) {
9484 --rescnt;
9485 *res++ = ' ';
9486 }
9487 if (dict && (argidx < arglen) && c != '%') {
9488 PyErr_SetString(PyExc_TypeError,
9489 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009490 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 goto onError;
9492 }
9493 Py_XDECREF(temp);
9494 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 } /* until end */
9496 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 PyErr_SetString(PyExc_TypeError,
9498 "not all arguments converted during string formatting");
9499 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 }
9501
Thomas Woutersa96affe2006-03-12 00:29:36 +00009502 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 }
9507 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 return (PyObject *)result;
9509
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 Py_XDECREF(result);
9512 Py_DECREF(uformat);
9513 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
9516 return NULL;
9517}
9518
Jeremy Hylton938ace62002-07-17 16:30:39 +00009519static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009520unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9521
Tim Peters6d6c1a32001-08-02 04:15:00 +00009522static PyObject *
9523unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9524{
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 static char *kwlist[] = {"object", "encoding", "errors", 0};
9527 char *encoding = NULL;
9528 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009529
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530 if (type != &PyUnicode_Type)
9531 return unicode_subtype_new(type, args, kwds);
9532 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009534 return NULL;
9535 if (x == NULL)
9536 return (PyObject *)_PyUnicode_New(0);
9537 if (encoding == NULL && errors == NULL)
9538 return PyObject_Str(x);
9539 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009541}
9542
Guido van Rossume023fe02001-08-30 03:12:59 +00009543static PyObject *
9544unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9545{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009546 PyUnicodeObject *tmp, *pnew;
9547 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009548
Benjamin Peterson14339b62009-01-31 16:36:08 +00009549 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9550 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9551 if (tmp == NULL)
9552 return NULL;
9553 assert(PyUnicode_Check(tmp));
9554 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9555 if (pnew == NULL) {
9556 Py_DECREF(tmp);
9557 return NULL;
9558 }
9559 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9560 if (pnew->str == NULL) {
9561 _Py_ForgetReference((PyObject *)pnew);
9562 PyObject_Del(pnew);
9563 Py_DECREF(tmp);
9564 return PyErr_NoMemory();
9565 }
9566 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9567 pnew->length = n;
9568 pnew->hash = tmp->hash;
9569 Py_DECREF(tmp);
9570 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009571}
9572
/* Docstring of the str type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009579
/* Forward declaration: the tp_iter slot below refers to unicode_iter(),
   which is defined further down in this file. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for "str".  The initializer is positional, so the order of
   the entries must follow the PyTypeObject slot layout exactly; each entry
   is labelled with the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                  /* tp_name */
    sizeof(PyUnicodeObject),                /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,            /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    unicode_repr,                           /* tp_repr */
    &unicode_as_number,                     /* tp_as_number */
    &unicode_as_sequence,                   /* tp_as_sequence */
    &unicode_as_mapping,                    /* tp_as_mapping */
    (hashfunc) unicode_hash,                /* tp_hash*/
    0,                                      /* tp_call*/
    (reprfunc) unicode_str,                 /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,            /* tp_flags */
    unicode_doc,                            /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    PyUnicode_RichCompare,                  /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    unicode_iter,                           /* tp_iter */
    0,                                      /* tp_iternext */
    unicode_methods,                        /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    &PyBaseObject_Type,                     /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    0,                                      /* tp_init */
    0,                                      /* tp_alloc */
    unicode_new,                            /* tp_new */
    PyObject_Del,                           /* tp_free */
};
9625
9626/* Initialize the Unicode implementation */
9627
Thomas Wouters78890102000-07-22 19:25:51 +00009628void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009630 int i;
9631
Thomas Wouters477c8d52006-05-27 19:21:47 +00009632 /* XXX - move this array to unicodectype.c ? */
9633 Py_UNICODE linebreak[] = {
9634 0x000A, /* LINE FEED */
9635 0x000D, /* CARRIAGE RETURN */
9636 0x001C, /* FILE SEPARATOR */
9637 0x001D, /* GROUP SEPARATOR */
9638 0x001E, /* RECORD SEPARATOR */
9639 0x0085, /* NEXT LINE */
9640 0x2028, /* LINE SEPARATOR */
9641 0x2029, /* PARAGRAPH SEPARATOR */
9642 };
9643
Fred Drakee4315f52000-05-09 19:53:39 +00009644 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009645 free_list = NULL;
9646 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009648 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009650
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009651 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009653 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009655
9656 /* initialize the linebreak bloom filter */
9657 bloom_linebreak = make_bloom_mask(
9658 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9659 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009660
9661 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662}
9663
9664/* Finalize the Unicode implementation */
9665
Christian Heimesa156e092008-02-16 07:38:31 +00009666int
9667PyUnicode_ClearFreeList(void)
9668{
9669 int freelist_size = numfree;
9670 PyUnicodeObject *u;
9671
9672 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 PyUnicodeObject *v = u;
9674 u = *(PyUnicodeObject **)u;
9675 if (v->str)
9676 PyObject_DEL(v->str);
9677 Py_XDECREF(v->defenc);
9678 PyObject_Del(v);
9679 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009680 }
9681 free_list = NULL;
9682 assert(numfree == 0);
9683 return freelist_size;
9684}
9685
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686void
Thomas Wouters78890102000-07-22 19:25:51 +00009687_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009689 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009691 Py_XDECREF(unicode_empty);
9692 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009693
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009694 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 if (unicode_latin1[i]) {
9696 Py_DECREF(unicode_latin1[i]);
9697 unicode_latin1[i] = NULL;
9698 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009699 }
Christian Heimesa156e092008-02-16 07:38:31 +00009700 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009702
Walter Dörwald16807132007-05-25 13:52:07 +00009703void
9704PyUnicode_InternInPlace(PyObject **p)
9705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9707 PyObject *t;
9708 if (s == NULL || !PyUnicode_Check(s))
9709 Py_FatalError(
9710 "PyUnicode_InternInPlace: unicode strings only please!");
9711 /* If it's a subclass, we don't really know what putting
9712 it in the interned dict might do. */
9713 if (!PyUnicode_CheckExact(s))
9714 return;
9715 if (PyUnicode_CHECK_INTERNED(s))
9716 return;
9717 if (interned == NULL) {
9718 interned = PyDict_New();
9719 if (interned == NULL) {
9720 PyErr_Clear(); /* Don't leave an exception */
9721 return;
9722 }
9723 }
9724 /* It might be that the GetItem call fails even
9725 though the key is present in the dictionary,
9726 namely when this happens during a stack overflow. */
9727 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009729 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009730
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 if (t) {
9732 Py_INCREF(t);
9733 Py_DECREF(*p);
9734 *p = t;
9735 return;
9736 }
Walter Dörwald16807132007-05-25 13:52:07 +00009737
Benjamin Peterson14339b62009-01-31 16:36:08 +00009738 PyThreadState_GET()->recursion_critical = 1;
9739 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9740 PyErr_Clear();
9741 PyThreadState_GET()->recursion_critical = 0;
9742 return;
9743 }
9744 PyThreadState_GET()->recursion_critical = 0;
9745 /* The two references in interned are not counted by refcnt.
9746 The deallocator will take care of this */
9747 Py_REFCNT(s) -= 2;
9748 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009749}
9750
9751void
9752PyUnicode_InternImmortal(PyObject **p)
9753{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009754 PyUnicode_InternInPlace(p);
9755 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9756 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9757 Py_INCREF(*p);
9758 }
Walter Dörwald16807132007-05-25 13:52:07 +00009759}
9760
9761PyObject *
9762PyUnicode_InternFromString(const char *cp)
9763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 PyObject *s = PyUnicode_FromString(cp);
9765 if (s == NULL)
9766 return NULL;
9767 PyUnicode_InternInPlace(&s);
9768 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009769}
9770
9771void _Py_ReleaseInternedUnicodeStrings(void)
9772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773 PyObject *keys;
9774 PyUnicodeObject *s;
9775 Py_ssize_t i, n;
9776 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009777
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 if (interned == NULL || !PyDict_Check(interned))
9779 return;
9780 keys = PyDict_Keys(interned);
9781 if (keys == NULL || !PyList_Check(keys)) {
9782 PyErr_Clear();
9783 return;
9784 }
Walter Dörwald16807132007-05-25 13:52:07 +00009785
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9787 detector, interned unicode strings are not forcibly deallocated;
9788 rather, we give them their stolen references back, and then clear
9789 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009790
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 n = PyList_GET_SIZE(keys);
9792 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009793 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009794 for (i = 0; i < n; i++) {
9795 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9796 switch (s->state) {
9797 case SSTATE_NOT_INTERNED:
9798 /* XXX Shouldn't happen */
9799 break;
9800 case SSTATE_INTERNED_IMMORTAL:
9801 Py_REFCNT(s) += 1;
9802 immortal_size += s->length;
9803 break;
9804 case SSTATE_INTERNED_MORTAL:
9805 Py_REFCNT(s) += 2;
9806 mortal_size += s->length;
9807 break;
9808 default:
9809 Py_FatalError("Inconsistent interned string state.");
9810 }
9811 s->state = SSTATE_NOT_INTERNED;
9812 }
9813 fprintf(stderr, "total size of all interned strings: "
9814 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9815 "mortal/immortal\n", mortal_size, immortal_size);
9816 Py_DECREF(keys);
9817 PyDict_Clear(interned);
9818 Py_DECREF(interned);
9819 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009820}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009821
9822
9823/********************* Unicode Iterator **************************/
9824
/* State of an iterator over a unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;      /* Index of the next character to yield */
    PyUnicodeObject *it_seq;  /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9830
9831static void
9832unicodeiter_dealloc(unicodeiterobject *it)
9833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 _PyObject_GC_UNTRACK(it);
9835 Py_XDECREF(it->it_seq);
9836 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009837}
9838
9839static int
9840unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 Py_VISIT(it->it_seq);
9843 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009844}
9845
9846static PyObject *
9847unicodeiter_next(unicodeiterobject *it)
9848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009849 PyUnicodeObject *seq;
9850 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009851
Benjamin Peterson14339b62009-01-31 16:36:08 +00009852 assert(it != NULL);
9853 seq = it->it_seq;
9854 if (seq == NULL)
9855 return NULL;
9856 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009857
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9859 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 if (item != NULL)
9862 ++it->it_index;
9863 return item;
9864 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009865
Benjamin Peterson14339b62009-01-31 16:36:08 +00009866 Py_DECREF(seq);
9867 it->it_seq = NULL;
9868 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009869}
9870
9871static PyObject *
9872unicodeiter_len(unicodeiterobject *it)
9873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009874 Py_ssize_t len = 0;
9875 if (it->it_seq)
9876 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9877 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009878}
9879
/* Docstring attached to the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the string iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9887
/* Type object for "str_iterator", the iterator over a unicode string.  The
   initializer is positional, so the order of the entries must follow the
   PyTypeObject slot layout exactly.  Slots after the last listed entry are
   left to default (zero) initialization. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
9920
9921static PyObject *
9922unicode_iter(PyObject *seq)
9923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009925
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 if (!PyUnicode_Check(seq)) {
9927 PyErr_BadInternalCall();
9928 return NULL;
9929 }
9930 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9931 if (it == NULL)
9932 return NULL;
9933 it->it_index = 0;
9934 Py_INCREF(seq);
9935 it->it_seq = (PyUnicodeObject *)seq;
9936 _PyObject_GC_TRACK(it);
9937 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009938}
9939
Martin v. Löwis5b222132007-06-10 09:51:05 +00009940size_t
9941Py_UNICODE_strlen(const Py_UNICODE *u)
9942{
9943 int res = 0;
9944 while(*u++)
9945 res++;
9946 return res;
9947}
9948
9949Py_UNICODE*
9950Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9951{
9952 Py_UNICODE *u = s1;
9953 while ((*u++ = *s2++));
9954 return s1;
9955}
9956
9957Py_UNICODE*
9958Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9959{
9960 Py_UNICODE *u = s1;
9961 while ((*u++ = *s2++))
9962 if (n-- == 0)
9963 break;
9964 return s1;
9965}
9966
9967int
9968Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9969{
9970 while (*s1 && *s2 && *s1 == *s2)
9971 s1++, s2++;
9972 if (*s1 && *s2)
9973 return (*s1 < *s2) ? -1 : +1;
9974 if (*s1)
9975 return 1;
9976 if (*s2)
9977 return -1;
9978 return 0;
9979}
9980
9981Py_UNICODE*
9982Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9983{
9984 const Py_UNICODE *p;
9985 for (p = s; *p; p++)
9986 if (*p == c)
9987 return (Py_UNICODE*)p;
9988 return NULL;
9989}
9990
9991
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009992#ifdef __cplusplus
9993}
9994#endif
9995
9996
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009997/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 Local variables:
9999 c-basic-offset: 4
10000 indent-tabs-mode: nil
10001 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010002*/