blob: 3bd1efd9392f91f72700155439ae8a81f0fb91db [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the ASCII range (128 entries): entry c is 1 when
   code point c is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Fast detection of line break characters in the ASCII range.

   Lookup table with 128 entries: entry c is 1 when code point c is a
   line break and 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the least
   significant 5 bits of each Unicode character selecting the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by `ch` is set in `mask`.  The shift is
   done on an unsigned long (1UL) because the bit index can be as large as
   31, and shifting a signed int into its sign bit is undefined. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: direct table lookup for ASCII, otherwise the bloom
   mask is used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* True when `chr` is a member of `set`: the bloom mask is checked first
   as a cheap pre-filter, then unicode_member() does the exact search.
   The whole expansion is parenthesized so the macro can be combined
   safely with other operators (e.g. `!BLOOM_MEMBER(...)`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Copy the NUL-terminated `string` to the output position `s`, advancing
   `s` past the copied characters (the terminator is not copied).  Relies
   on variables named `copy` and `s` being in scope at the point of use. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000794 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* Convert the argument to a bytes object, according to the file
1534 system encoding */
1535
1536int
1537PyUnicode_FSConverter(PyObject* arg, void* addr)
1538{
1539 PyObject *output = NULL;
1540 Py_ssize_t size;
1541 void *data;
1542 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1543 output = arg;
1544 Py_INCREF(output);
1545 }
1546 else {
1547 arg = PyUnicode_FromObject(arg);
1548 if (!arg)
1549 return 0;
1550 output = PyUnicode_AsEncodedObject(arg,
1551 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001552 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001553 Py_DECREF(arg);
1554 if (!output)
1555 return 0;
1556 if (!PyBytes_Check(output)) {
1557 Py_DECREF(output);
1558 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1559 return 0;
1560 }
1561 }
1562 if (PyBytes_Check(output)) {
1563 size = PyBytes_GET_SIZE(output);
1564 data = PyBytes_AS_STRING(output);
1565 }
1566 else {
1567 size = PyByteArray_GET_SIZE(output);
1568 data = PyByteArray_AS_STRING(output);
1569 }
1570 if (size != strlen(data)) {
1571 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1572 Py_DECREF(output);
1573 return 0;
1574 }
1575 *(PyObject**)addr = output;
1576 return 1;
1577}
1578
1579
Martin v. Löwis5b222132007-06-10 09:51:05 +00001580char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001581_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001582{
Christian Heimesf3863112007-11-22 07:46:41 +00001583 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001584 if (!PyUnicode_Check(unicode)) {
1585 PyErr_BadArgument();
1586 return NULL;
1587 }
Christian Heimesf3863112007-11-22 07:46:41 +00001588 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1589 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001590 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001591 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001592 *psize = PyBytes_GET_SIZE(bytes);
1593 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001594}
1595
1596char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001597_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001599 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001600}
1601
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1603{
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 goto onError;
1607 }
1608 return PyUnicode_AS_UNICODE(unicode);
1609
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 return NULL;
1612}
1613
Martin v. Löwis18e16552006-02-15 17:27:45 +00001614Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 goto onError;
1619 }
1620 return PyUnicode_GET_SIZE(unicode);
1621
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return -1;
1624}
1625
Thomas Wouters78890102000-07-22 19:25:51 +00001626const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001627{
1628 return unicode_default_encoding;
1629}
1630
1631int PyUnicode_SetDefaultEncoding(const char *encoding)
1632{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001633 if (strcmp(encoding, unicode_default_encoding) != 0) {
1634 PyErr_Format(PyExc_ValueError,
1635 "Can only set default encoding to %s",
1636 unicode_default_encoding);
1637 return -1;
1638 }
Fred Drakee4315f52000-05-09 19:53:39 +00001639 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001640}
1641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642/* error handling callback helper:
1643 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001644 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001645 and adjust various state variables.
1646 return 0 on success, -1 on error
1647*/
1648
1649static
1650int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 const char *encoding, const char *reason,
1652 const char **input, const char **inend, Py_ssize_t *startinpos,
1653 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1654 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001656 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657
1658 PyObject *restuple = NULL;
1659 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001660 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001661 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001662 Py_ssize_t requiredsize;
1663 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001665 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 int res = -1;
1668
1669 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 *errorHandler = PyCodec_LookupError(errors);
1671 if (*errorHandler == NULL)
1672 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 }
1674
1675 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001676 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1678 if (*exceptionObject == NULL)
1679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 }
1681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001682 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1683 goto onError;
1684 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1685 goto onError;
1686 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 }
1689
1690 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1691 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001692 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001694 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001696 }
1697 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001698 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001699
1700 /* Copy back the bytes variables, which might have been modified by the
1701 callback */
1702 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1703 if (!inputobj)
1704 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001705 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001706 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001707 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001708 *input = PyBytes_AS_STRING(inputobj);
1709 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001710 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001711 /* we can DECREF safely, as the exception has another reference,
1712 so the object won't go away. */
1713 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001716 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001717 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001718 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1719 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721
1722 /* need more space? (at least enough for what we
1723 have+the replacement+the rest of the string (starting
1724 at the new input position), so we won't have to check space
1725 when there are no errors in the rest of the string) */
1726 repptr = PyUnicode_AS_UNICODE(repunicode);
1727 repsize = PyUnicode_GET_SIZE(repunicode);
1728 requiredsize = *outpos + repsize + insize-newpos;
1729 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001730 if (requiredsize<2*outsize)
1731 requiredsize = 2*outsize;
1732 if (_PyUnicode_Resize(output, requiredsize) < 0)
1733 goto onError;
1734 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 }
1736 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001737 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 Py_UNICODE_COPY(*outptr, repptr, repsize);
1739 *outptr += repsize;
1740 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 /* we made it! */
1743 res = 0;
1744
Benjamin Peterson29060642009-01-31 22:14:21 +00001745 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 Py_XDECREF(restuple);
1747 return res;
1748}
1749
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750/* --- UTF-7 Codec -------------------------------------------------------- */
1751
Antoine Pitrou244651a2009-05-04 18:56:13 +00001752/* See RFC2152 for details. We encode conservatively and decode liberally. */
1753
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so only pass side-effect-free expressions. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62;
   anything else is taken to be '/' (63). */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
1784
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array is indexed by ASCII code and
 * identifies these different sets:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel  bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can  em sub esc  fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20:  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0,   0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30:   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40:   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50:   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60:   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70:   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~ del */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * NUL and anything outside ASCII are never direct; the range test comes
 * first so the table is only indexed with values 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001831PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 Py_ssize_t size,
1833 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001835 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1836}
1837
Antoine Pitrou244651a2009-05-04 18:56:13 +00001838/* The decoder. The only state we preserve is our read position,
1839 * i.e. how many characters we have consumed. So if we end in the
1840 * middle of a shift sequence we have to back off the read position
1841 * and the output to the beginning of the sequence, otherwise we lose
1842 * all the shift state (seen bits, number of bits seen, high
1843 * surrogate). */
1844
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001845PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 Py_ssize_t size,
1847 const char *errors,
1848 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001851 Py_ssize_t startinpos;
1852 Py_ssize_t endinpos;
1853 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854 const char *e;
1855 PyUnicodeObject *unicode;
1856 Py_UNICODE *p;
1857 const char *errmsg = "";
1858 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001859 Py_UNICODE *shiftOutStart;
1860 unsigned int base64bits = 0;
1861 unsigned long base64buffer = 0;
1862 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 PyObject *errorHandler = NULL;
1864 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865
1866 unicode = _PyUnicode_New(size);
1867 if (!unicode)
1868 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001869 if (size == 0) {
1870 if (consumed)
1871 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001873 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001874
1875 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001876 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 e = s + size;
1878
1879 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001882 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883
Antoine Pitrou244651a2009-05-04 18:56:13 +00001884 if (inShift) { /* in a base-64 section */
1885 if (IS_BASE64(ch)) { /* consume a base-64 character */
1886 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1887 base64bits += 6;
1888 s++;
1889 if (base64bits >= 16) {
1890 /* we have enough bits for a UTF-16 value */
1891 Py_UNICODE outCh = (Py_UNICODE)
1892 (base64buffer >> (base64bits-16));
1893 base64bits -= 16;
1894 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1895 if (surrogate) {
1896 /* expecting a second surrogate */
1897 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1898#ifdef Py_UNICODE_WIDE
1899 *p++ = (((surrogate & 0x3FF)<<10)
1900 | (outCh & 0x3FF)) + 0x10000;
1901#else
1902 *p++ = surrogate;
1903 *p++ = outCh;
1904#endif
1905 surrogate = 0;
1906 }
1907 else {
1908 surrogate = 0;
1909 errmsg = "second surrogate missing";
1910 goto utf7Error;
1911 }
1912 }
1913 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1914 /* first surrogate */
1915 surrogate = outCh;
1916 }
1917 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1918 errmsg = "unexpected second surrogate";
1919 goto utf7Error;
1920 }
1921 else {
1922 *p++ = outCh;
1923 }
1924 }
1925 }
1926 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001927 inShift = 0;
1928 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001929 if (surrogate) {
1930 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001931 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001933 if (base64bits > 0) { /* left-over bits */
1934 if (base64bits >= 6) {
1935 /* We've seen at least one base-64 character */
1936 errmsg = "partial character in shift sequence";
1937 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001938 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001939 else {
1940 /* Some bits remain; they should be zero */
1941 if (base64buffer != 0) {
1942 errmsg = "non-zero padding bits in shift sequence";
1943 goto utf7Error;
1944 }
1945 }
1946 }
1947 if (ch != '-') {
1948 /* '-' is absorbed; other terminating
1949 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950 *p++ = ch;
1951 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952 }
1953 }
1954 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001956 s++; /* consume '+' */
1957 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001958 s++;
1959 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960 }
1961 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001963 shiftOutStart = p;
1964 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965 }
1966 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968 *p++ = ch;
1969 s++;
1970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001971 else {
1972 startinpos = s-starts;
1973 s++;
1974 errmsg = "unexpected special character";
1975 goto utf7Error;
1976 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001977 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001978utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 outpos = p-PyUnicode_AS_UNICODE(unicode);
1980 endinpos = s-starts;
1981 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001982 errors, &errorHandler,
1983 "utf7", errmsg,
1984 &starts, &e, &startinpos, &endinpos, &exc, &s,
1985 &unicode, &outpos, &p))
1986 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987 }
1988
Antoine Pitrou244651a2009-05-04 18:56:13 +00001989 /* end of string */
1990
1991 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1992 /* if we're in an inconsistent state, that's an error */
1993 if (surrogate ||
1994 (base64bits >= 6) ||
1995 (base64bits > 0 && base64buffer != 0)) {
1996 outpos = p-PyUnicode_AS_UNICODE(unicode);
1997 endinpos = size;
1998 if (unicode_decode_call_errorhandler(
1999 errors, &errorHandler,
2000 "utf7", "unterminated shift sequence",
2001 &starts, &e, &startinpos, &endinpos, &exc, &s,
2002 &unicode, &outpos, &p))
2003 goto onError;
2004 if (s < e)
2005 goto restart;
2006 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002007 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002008
2009 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002010 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002011 if (inShift) {
2012 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002013 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014 }
2015 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002016 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002017 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002018 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002019
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002020 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 goto onError;
2022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 Py_XDECREF(errorHandler);
2024 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 return (PyObject *)unicode;
2026
Benjamin Peterson29060642009-01-31 22:14:21 +00002027 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 Py_XDECREF(errorHandler);
2029 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030 Py_DECREF(unicode);
2031 return NULL;
2032}
2033
2034
2035PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002036 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002037 int base64SetO,
2038 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002041 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002042 /* It might be possible to tighten this worst case */
Antoine Pitrou244651a2009-05-04 18:56:13 +00002043 Py_ssize_t allocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002045 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002046 unsigned int base64bits = 0;
2047 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 char * out;
2049 char * start;
2050
2051 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053
Antoine Pitrou244651a2009-05-04 18:56:13 +00002054 if (allocated / 5 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002055 return PyErr_NoMemory();
2056
Antoine Pitrou244651a2009-05-04 18:56:13 +00002057 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 if (v == NULL)
2059 return NULL;
2060
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002061 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 for (;i < size; ++i) {
2063 Py_UNICODE ch = s[i];
2064
Antoine Pitrou244651a2009-05-04 18:56:13 +00002065 if (inShift) {
2066 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2067 /* shifting out */
2068 if (base64bits) { /* output remaining bits */
2069 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2070 base64buffer = 0;
2071 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 }
2073 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 /* Characters not in the BASE64 set implicitly unshift the sequence
2075 so no '-' is required, except if the character is itself a '-' */
2076 if (IS_BASE64(ch) || ch == '-') {
2077 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002079 *out++ = (char) ch;
2080 }
2081 else {
2082 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002084 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 else { /* not in a shift sequence */
2086 if (ch == '+') {
2087 *out++ = '+';
2088 *out++ = '-';
2089 }
2090 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2091 *out++ = (char) ch;
2092 }
2093 else {
2094 *out++ = '+';
2095 inShift = 1;
2096 goto encode_char;
2097 }
2098 }
2099 continue;
2100encode_char:
2101#ifdef Py_UNICODE_WIDE
2102 if (ch >= 0x10000) {
2103 /* code first surrogate */
2104 base64bits += 16;
2105 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2106 while (base64bits >= 6) {
2107 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2108 base64bits -= 6;
2109 }
2110 /* prepare second surrogate */
2111 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2112 }
2113#endif
2114 base64bits += 16;
2115 base64buffer = (base64buffer << 16) | ch;
2116 while (base64bits >= 6) {
2117 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2118 base64bits -= 6;
2119 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002120 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002121 if (base64bits)
2122 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2123 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002125 if (_PyBytes_Resize(&v, out - start) < 0)
2126 return NULL;
2127 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128}
2129
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136/* --- UTF-8 Codec -------------------------------------------------------- */
2137
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */

    /* 0x00-0x7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,   5, 5, 5, 5, 6, 6, 0, 0
};
2159
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 Py_ssize_t size,
2162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163{
Walter Dörwald69652032004-09-07 20:24:22 +00002164 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2165}
2166
Antoine Pitrouab868312009-01-10 15:40:25 +00002167/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2168#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2169
2170/* Mask to quickly check whether a C 'long' contains a
2171 non-ASCII, UTF8-encoded char. */
2172#if (SIZEOF_LONG == 8)
2173# define ASCII_CHAR_MASK 0x8080808080808080L
2174#elif (SIZEOF_LONG == 4)
2175# define ASCII_CHAR_MASK 0x80808080L
2176#else
2177# error C 'long' size should be either 4 or 8!
2178#endif
2179
Walter Dörwald69652032004-09-07 20:24:22 +00002180PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002181 Py_ssize_t size,
2182 const char *errors,
2183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002185 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002187 Py_ssize_t startinpos;
2188 Py_ssize_t endinpos;
2189 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002190 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 PyUnicodeObject *unicode;
2192 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002193 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002194 PyObject *errorHandler = NULL;
2195 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
2197 /* Note: size will always be longer than the resulting Unicode
2198 character count */
2199 unicode = _PyUnicode_New(size);
2200 if (!unicode)
2201 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002202 if (size == 0) {
2203 if (consumed)
2204 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207
2208 /* Unpack UTF-8 encoded data */
2209 p = unicode->str;
2210 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002211 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
2213 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002217 /* Fast path for runs of ASCII characters. Given that common UTF-8
2218 input will consist of an overwhelming majority of ASCII
2219 characters, we try to optimize for this case by checking
2220 as many characters as a C 'long' can contain.
2221 First, check if we can do an aligned read, as most CPUs have
2222 a penalty for unaligned reads.
2223 */
2224 if (!((size_t) s & LONG_PTR_MASK)) {
2225 /* Help register allocation */
2226 register const char *_s = s;
2227 register Py_UNICODE *_p = p;
2228 while (_s < aligned_end) {
2229 /* Read a whole long at a time (either 4 or 8 bytes),
2230 and do a fast unrolled copy if it only contains ASCII
2231 characters. */
2232 unsigned long data = *(unsigned long *) _s;
2233 if (data & ASCII_CHAR_MASK)
2234 break;
2235 _p[0] = (unsigned char) _s[0];
2236 _p[1] = (unsigned char) _s[1];
2237 _p[2] = (unsigned char) _s[2];
2238 _p[3] = (unsigned char) _s[3];
2239#if (SIZEOF_LONG == 8)
2240 _p[4] = (unsigned char) _s[4];
2241 _p[5] = (unsigned char) _s[5];
2242 _p[6] = (unsigned char) _s[6];
2243 _p[7] = (unsigned char) _s[7];
2244#endif
2245 _s += SIZEOF_LONG;
2246 _p += SIZEOF_LONG;
2247 }
2248 s = _s;
2249 p = _p;
2250 if (s == e)
2251 break;
2252 ch = (unsigned char)*s;
2253 }
2254 }
2255
2256 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002257 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 s++;
2259 continue;
2260 }
2261
2262 n = utf8_code_length[ch];
2263
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002264 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 if (consumed)
2266 break;
2267 else {
2268 errmsg = "unexpected end of data";
2269 startinpos = s-starts;
2270 endinpos = size;
2271 goto utf8Error;
2272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274
2275 switch (n) {
2276
2277 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002278 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002279 startinpos = s-starts;
2280 endinpos = startinpos+1;
2281 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282
2283 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002284 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 startinpos = s-starts;
2286 endinpos = startinpos+1;
2287 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288
2289 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002290 if ((s[1] & 0xc0) != 0x80) {
2291 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 startinpos = s-starts;
2293 endinpos = startinpos+2;
2294 goto utf8Error;
2295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002297 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 startinpos = s-starts;
2299 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002300 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 goto utf8Error;
2302 }
2303 else
2304 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 break;
2306
2307 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002308 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002309 (s[2] & 0xc0) != 0x80) {
2310 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002311 startinpos = s-starts;
2312 endinpos = startinpos+3;
2313 goto utf8Error;
2314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002316 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002317 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002318 startinpos = s-starts;
2319 endinpos = startinpos+3;
2320 goto utf8Error;
2321 }
2322 else
2323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002324 break;
2325
2326 case 4:
2327 if ((s[1] & 0xc0) != 0x80 ||
2328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002329 (s[3] & 0xc0) != 0x80) {
2330 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 startinpos = s-starts;
2332 endinpos = startinpos+4;
2333 goto utf8Error;
2334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002338 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002340 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002341 UTF-16 */
2342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002343 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002344 startinpos = s-starts;
2345 endinpos = startinpos+4;
2346 goto utf8Error;
2347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002348#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002353 /* translate from 10000..10FFFF to 0..FFFF */
2354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002356 /* high surrogate = top 10 bits added to D800 */
2357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 break;
2363
2364 default:
2365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002366 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 startinpos = s-starts;
2368 endinpos = startinpos+n;
2369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 }
2371 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002373
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 utf8Error:
2375 outpos = p-PyUnicode_AS_UNICODE(unicode);
2376 if (unicode_decode_call_errorhandler(
2377 errors, &errorHandler,
2378 "utf8", errmsg,
2379 &starts, &e, &startinpos, &endinpos, &exc, &s,
2380 &unicode, &outpos, &p))
2381 goto onError;
2382 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 }
Walter Dörwald69652032004-09-07 20:24:22 +00002384 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002388 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 goto onError;
2390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 Py_XDECREF(errorHandler);
2392 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 return (PyObject *)unicode;
2394
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 Py_XDECREF(errorHandler);
2397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 Py_DECREF(unicode);
2399 return NULL;
2400}
2401
Antoine Pitrouab868312009-01-10 15:40:25 +00002402#undef ASCII_CHAR_MASK
2403
2404
Tim Peters602f7402002-04-27 18:03:26 +00002405/* Allocation strategy: if the string is short, convert into a stack buffer
2406 and allocate exactly as much space needed at the end. Else allocate the
2407 maximum possible needed (4 result bytes per Unicode character), and return
2408 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002409*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002410PyObject *
2411PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 Py_ssize_t size,
2413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414{
Tim Peters602f7402002-04-27 18:03:26 +00002415#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002416
Guido van Rossum98297ee2007-11-06 21:34:58 +00002417 Py_ssize_t i; /* index into s of next input byte */
2418 PyObject *result; /* result string object */
2419 char *p; /* next free byte in output buffer */
2420 Py_ssize_t nallocated; /* number of result bytes allocated */
2421 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002422 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002423 PyObject *errorHandler = NULL;
2424 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002425
Tim Peters602f7402002-04-27 18:03:26 +00002426 assert(s != NULL);
2427 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428
Tim Peters602f7402002-04-27 18:03:26 +00002429 if (size <= MAX_SHORT_UNICHARS) {
2430 /* Write into the stack buffer; nallocated can't overflow.
2431 * At the end, we'll allocate exactly as much heap space as it
2432 * turns out we need.
2433 */
2434 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002435 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002436 p = stackbuf;
2437 }
2438 else {
2439 /* Overallocate on the heap, and give the excess back at the end. */
2440 nallocated = size * 4;
2441 if (nallocated / 4 != size) /* overflow! */
2442 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002443 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002444 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002445 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002446 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002447 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002448
Tim Peters602f7402002-04-27 18:03:26 +00002449 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002450 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002451
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002452 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002453 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002457 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002458 *p++ = (char)(0xc0 | (ch >> 6));
2459 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002460 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002461 else {
Tim Peters602f7402002-04-27 18:03:26 +00002462 /* Encode UCS2 Unicode ordinals */
2463 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002464#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002465 /* Special case: check for high surrogate */
2466 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2467 Py_UCS4 ch2 = s[i];
2468 /* Check for low surrogate and combine the two to
2469 form a UCS4 value */
2470 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002471 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002472 i++;
2473 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002474 }
Tim Peters602f7402002-04-27 18:03:26 +00002475 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002476 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002477#endif
2478 if (ch >= 0xd800 && ch <= 0xdfff) {
2479 Py_ssize_t newpos;
2480 PyObject *rep;
2481 char *prep;
2482 int k;
2483 rep = unicode_encode_call_errorhandler
2484 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2485 s, size, &exc, i-1, i, &newpos);
2486 if (!rep)
2487 goto error;
2488 /* Implementation limitations: only support error handler that return
2489 bytes, and only support up to four replacement bytes. */
2490 if (!PyBytes_Check(rep)) {
2491 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2492 Py_DECREF(rep);
2493 goto error;
2494 }
2495 if (PyBytes_Size(rep) > 4) {
2496 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2497 Py_DECREF(rep);
2498 goto error;
2499 }
2500 prep = PyBytes_AsString(rep);
2501 for(k = PyBytes_Size(rep); k > 0; k--)
2502 *p++ = *prep++;
2503 Py_DECREF(rep);
2504 continue;
2505
2506 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002507 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002508 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2509 *p++ = (char)(0x80 | (ch & 0x3f));
2510 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002513 /* Encode UCS4 Unicode ordinals */
2514 *p++ = (char)(0xf0 | (ch >> 18));
2515 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2516 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2517 *p++ = (char)(0x80 | (ch & 0x3f));
2518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002520
Guido van Rossum98297ee2007-11-06 21:34:58 +00002521 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002522 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002523 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002524 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002525 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002526 }
2527 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002528 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002529 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002530 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002533 Py_XDECREF(errorHandler);
2534 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002535 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002536 error:
2537 Py_XDECREF(errorHandler);
2538 Py_XDECREF(exc);
2539 Py_XDECREF(result);
2540 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002541
Tim Peters602f7402002-04-27 18:03:26 +00002542#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543}
2544
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (!PyUnicode_Check(unicode)) {
2548 PyErr_BadArgument();
2549 return NULL;
2550 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002551 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002552 PyUnicode_GET_SIZE(unicode),
2553 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554}
2555
Walter Dörwald41980ca2007-08-16 21:55:45 +00002556/* --- UTF-32 Codec ------------------------------------------------------- */
2557
2558PyObject *
2559PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 Py_ssize_t size,
2561 const char *errors,
2562 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002563{
2564 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2565}
2566
2567PyObject *
2568PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002569 Py_ssize_t size,
2570 const char *errors,
2571 int *byteorder,
2572 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002573{
2574 const char *starts = s;
2575 Py_ssize_t startinpos;
2576 Py_ssize_t endinpos;
2577 Py_ssize_t outpos;
2578 PyUnicodeObject *unicode;
2579 Py_UNICODE *p;
2580#ifndef Py_UNICODE_WIDE
2581 int i, pairs;
2582#else
2583 const int pairs = 0;
2584#endif
2585 const unsigned char *q, *e;
2586 int bo = 0; /* assume native ordering by default */
2587 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002588 /* Offsets from q for retrieving bytes in the right order. */
2589#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2590 int iorder[] = {0, 1, 2, 3};
2591#else
2592 int iorder[] = {3, 2, 1, 0};
2593#endif
2594 PyObject *errorHandler = NULL;
2595 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002596 /* On narrow builds we split characters outside the BMP into two
2597 codepoints => count how much extra space we need. */
2598#ifndef Py_UNICODE_WIDE
2599 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 if (((Py_UCS4 *)s)[i] >= 0x10000)
2601 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002602#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002603
2604 /* This might be one to much, because of a BOM */
2605 unicode = _PyUnicode_New((size+3)/4+pairs);
2606 if (!unicode)
2607 return NULL;
2608 if (size == 0)
2609 return (PyObject *)unicode;
2610
2611 /* Unpack UTF-32 encoded data */
2612 p = unicode->str;
2613 q = (unsigned char *)s;
2614 e = q + size;
2615
2616 if (byteorder)
2617 bo = *byteorder;
2618
2619 /* Check for BOM marks (U+FEFF) in the input and adjust current
2620 byte order setting accordingly. In native mode, the leading BOM
2621 mark is skipped, in all other modes, it is copied to the output
2622 stream as-is (giving a ZWNBSP character). */
2623 if (bo == 0) {
2624 if (size >= 4) {
2625 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002627#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 if (bom == 0x0000FEFF) {
2629 q += 4;
2630 bo = -1;
2631 }
2632 else if (bom == 0xFFFE0000) {
2633 q += 4;
2634 bo = 1;
2635 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002636#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002637 if (bom == 0x0000FEFF) {
2638 q += 4;
2639 bo = 1;
2640 }
2641 else if (bom == 0xFFFE0000) {
2642 q += 4;
2643 bo = -1;
2644 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002645#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002646 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647 }
2648
2649 if (bo == -1) {
2650 /* force LE */
2651 iorder[0] = 0;
2652 iorder[1] = 1;
2653 iorder[2] = 2;
2654 iorder[3] = 3;
2655 }
2656 else if (bo == 1) {
2657 /* force BE */
2658 iorder[0] = 3;
2659 iorder[1] = 2;
2660 iorder[2] = 1;
2661 iorder[3] = 0;
2662 }
2663
2664 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 Py_UCS4 ch;
2666 /* remaining bytes at the end? (size should be divisible by 4) */
2667 if (e-q<4) {
2668 if (consumed)
2669 break;
2670 errmsg = "truncated data";
2671 startinpos = ((const char *)q)-starts;
2672 endinpos = ((const char *)e)-starts;
2673 goto utf32Error;
2674 /* The remaining input chars are ignored if the callback
2675 chooses to skip the input */
2676 }
2677 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2678 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002679
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 if (ch >= 0x110000)
2681 {
2682 errmsg = "codepoint not in range(0x110000)";
2683 startinpos = ((const char *)q)-starts;
2684 endinpos = startinpos+4;
2685 goto utf32Error;
2686 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002687#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 if (ch >= 0x10000)
2689 {
2690 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2691 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 }
2693 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002694#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002695 *p++ = ch;
2696 q += 4;
2697 continue;
2698 utf32Error:
2699 outpos = p-PyUnicode_AS_UNICODE(unicode);
2700 if (unicode_decode_call_errorhandler(
2701 errors, &errorHandler,
2702 "utf32", errmsg,
2703 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2704 &unicode, &outpos, &p))
2705 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002706 }
2707
2708 if (byteorder)
2709 *byteorder = bo;
2710
2711 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002713
2714 /* Adjust length */
2715 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2716 goto onError;
2717
2718 Py_XDECREF(errorHandler);
2719 Py_XDECREF(exc);
2720 return (PyObject *)unicode;
2721
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002723 Py_DECREF(unicode);
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return NULL;
2727}
2728
2729PyObject *
2730PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 Py_ssize_t size,
2732 const char *errors,
2733 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002735 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002736 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002737 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002738#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002739 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740#else
2741 const int pairs = 0;
2742#endif
2743 /* Offsets from p for storing byte pairs in the right order. */
2744#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2745 int iorder[] = {0, 1, 2, 3};
2746#else
2747 int iorder[] = {3, 2, 1, 0};
2748#endif
2749
Benjamin Peterson29060642009-01-31 22:14:21 +00002750#define STORECHAR(CH) \
2751 do { \
2752 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2753 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2754 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2755 p[iorder[0]] = (CH) & 0xff; \
2756 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002757 } while(0)
2758
2759 /* In narrow builds we can output surrogate pairs as one codepoint,
2760 so we need less space. */
2761#ifndef Py_UNICODE_WIDE
2762 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2764 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2765 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002767 nsize = (size - pairs + (byteorder == 0));
2768 bytesize = nsize * 4;
2769 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002771 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772 if (v == NULL)
2773 return NULL;
2774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002775 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002779 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780
2781 if (byteorder == -1) {
2782 /* force LE */
2783 iorder[0] = 0;
2784 iorder[1] = 1;
2785 iorder[2] = 2;
2786 iorder[3] = 3;
2787 }
2788 else if (byteorder == 1) {
2789 /* force BE */
2790 iorder[0] = 3;
2791 iorder[1] = 2;
2792 iorder[2] = 1;
2793 iorder[3] = 0;
2794 }
2795
2796 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002798#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2800 Py_UCS4 ch2 = *s;
2801 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2802 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2803 s++;
2804 size--;
2805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807#endif
2808 STORECHAR(ch);
2809 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002810
2811 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002812 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002813#undef STORECHAR
2814}
2815
2816PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2817{
2818 if (!PyUnicode_Check(unicode)) {
2819 PyErr_BadArgument();
2820 return NULL;
2821 }
2822 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 PyUnicode_GET_SIZE(unicode),
2824 NULL,
2825 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002826}
2827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828/* --- UTF-16 Codec ------------------------------------------------------- */
2829
Tim Peters772747b2001-08-09 22:21:55 +00002830PyObject *
2831PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_ssize_t size,
2833 const char *errors,
2834 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835{
Walter Dörwald69652032004-09-07 20:24:22 +00002836 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2837}
2838
Antoine Pitrouab868312009-01-10 15:40:25 +00002839/* Two masks for fast checking of whether a C 'long' may contain
2840 UTF16-encoded surrogate characters. This is an efficient heuristic,
2841 assuming that non-surrogate characters with a code point >= 0x8000 are
2842 rare in most input.
2843 FAST_CHAR_MASK is used when the input is in native byte ordering,
2844 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002845*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002846#if (SIZEOF_LONG == 8)
2847# define FAST_CHAR_MASK 0x8000800080008000L
2848# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2849#elif (SIZEOF_LONG == 4)
2850# define FAST_CHAR_MASK 0x80008000L
2851# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2852#else
2853# error C 'long' size should be either 4 or 8!
2854#endif
2855
Walter Dörwald69652032004-09-07 20:24:22 +00002856PyObject *
2857PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 Py_ssize_t size,
2859 const char *errors,
2860 int *byteorder,
2861 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t startinpos;
2865 Py_ssize_t endinpos;
2866 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 PyUnicodeObject *unicode;
2868 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002869 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002870 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002871 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002872 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002873 /* Offsets from q for retrieving byte pairs in the right order. */
2874#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2875 int ihi = 1, ilo = 0;
2876#else
2877 int ihi = 0, ilo = 1;
2878#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 PyObject *errorHandler = NULL;
2880 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881
2882 /* Note: size will always be longer than the resulting Unicode
2883 character count */
2884 unicode = _PyUnicode_New(size);
2885 if (!unicode)
2886 return NULL;
2887 if (size == 0)
2888 return (PyObject *)unicode;
2889
2890 /* Unpack UTF-16 encoded data */
2891 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002892 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002893 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894
2895 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002896 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002898 /* Check for BOM marks (U+FEFF) in the input and adjust current
2899 byte order setting accordingly. In native mode, the leading BOM
2900 mark is skipped, in all other modes, it is copied to the output
2901 stream as-is (giving a ZWNBSP character). */
2902 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002903 if (size >= 2) {
2904 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002905#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 if (bom == 0xFEFF) {
2907 q += 2;
2908 bo = -1;
2909 }
2910 else if (bom == 0xFFFE) {
2911 q += 2;
2912 bo = 1;
2913 }
Tim Petersced69f82003-09-16 20:30:58 +00002914#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 if (bom == 0xFEFF) {
2916 q += 2;
2917 bo = 1;
2918 }
2919 else if (bom == 0xFFFE) {
2920 q += 2;
2921 bo = -1;
2922 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002923#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926
Tim Peters772747b2001-08-09 22:21:55 +00002927 if (bo == -1) {
2928 /* force LE */
2929 ihi = 1;
2930 ilo = 0;
2931 }
2932 else if (bo == 1) {
2933 /* force BE */
2934 ihi = 0;
2935 ilo = 1;
2936 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002937#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2938 native_ordering = ilo < ihi;
2939#else
2940 native_ordering = ilo > ihi;
2941#endif
Tim Peters772747b2001-08-09 22:21:55 +00002942
Antoine Pitrouab868312009-01-10 15:40:25 +00002943 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002944 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002946 /* First check for possible aligned read of a C 'long'. Unaligned
2947 reads are more expensive, better to defer to another iteration. */
2948 if (!((size_t) q & LONG_PTR_MASK)) {
2949 /* Fast path for runs of non-surrogate chars. */
2950 register const unsigned char *_q = q;
2951 Py_UNICODE *_p = p;
2952 if (native_ordering) {
2953 /* Native ordering is simple: as long as the input cannot
2954 possibly contain a surrogate char, do an unrolled copy
2955 of several 16-bit code points to the target object.
2956 The non-surrogate check is done on several input bytes
2957 at a time (as many as a C 'long' can contain). */
2958 while (_q < aligned_end) {
2959 unsigned long data = * (unsigned long *) _q;
2960 if (data & FAST_CHAR_MASK)
2961 break;
2962 _p[0] = ((unsigned short *) _q)[0];
2963 _p[1] = ((unsigned short *) _q)[1];
2964#if (SIZEOF_LONG == 8)
2965 _p[2] = ((unsigned short *) _q)[2];
2966 _p[3] = ((unsigned short *) _q)[3];
2967#endif
2968 _q += SIZEOF_LONG;
2969 _p += SIZEOF_LONG / 2;
2970 }
2971 }
2972 else {
2973 /* Byteswapped ordering is similar, but we must decompose
2974 the copy bytewise, and take care of zero'ing out the
2975 upper bytes if the target object is in 32-bit units
2976 (that is, in UCS-4 builds). */
2977 while (_q < aligned_end) {
2978 unsigned long data = * (unsigned long *) _q;
2979 if (data & SWAPPED_FAST_CHAR_MASK)
2980 break;
2981 /* Zero upper bytes in UCS-4 builds */
2982#if (Py_UNICODE_SIZE > 2)
2983 _p[0] = 0;
2984 _p[1] = 0;
2985#if (SIZEOF_LONG == 8)
2986 _p[2] = 0;
2987 _p[3] = 0;
2988#endif
2989#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002990 /* Issue #4916; UCS-4 builds on big endian machines must
2991 fill the two last bytes of each 4-byte unit. */
2992#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2993# define OFF 2
2994#else
2995# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002996#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002997 ((unsigned char *) _p)[OFF + 1] = _q[0];
2998 ((unsigned char *) _p)[OFF + 0] = _q[1];
2999 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3000 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3001#if (SIZEOF_LONG == 8)
3002 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3003 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3004 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3005 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3006#endif
3007#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003008 _q += SIZEOF_LONG;
3009 _p += SIZEOF_LONG / 2;
3010 }
3011 }
3012 p = _p;
3013 q = _q;
3014 if (q >= e)
3015 break;
3016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018
Benjamin Peterson14339b62009-01-31 16:36:08 +00003019 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003020
3021 if (ch < 0xD800 || ch > 0xDFFF) {
3022 *p++ = ch;
3023 continue;
3024 }
3025
3026 /* UTF-16 code pair: */
3027 if (q > e) {
3028 errmsg = "unexpected end of data";
3029 startinpos = (((const char *)q) - 2) - starts;
3030 endinpos = ((const char *)e) + 1 - starts;
3031 goto utf16Error;
3032 }
3033 if (0xD800 <= ch && ch <= 0xDBFF) {
3034 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3035 q += 2;
3036 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003037#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 *p++ = ch;
3039 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003040#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003042#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 continue;
3044 }
3045 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 startinpos = (((const char *)q)-4)-starts;
3048 endinpos = startinpos+2;
3049 goto utf16Error;
3050 }
3051
Benjamin Peterson14339b62009-01-31 16:36:08 +00003052 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 errmsg = "illegal encoding";
3054 startinpos = (((const char *)q)-2)-starts;
3055 endinpos = startinpos+2;
3056 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 utf16Error:
3059 outpos = p - PyUnicode_AS_UNICODE(unicode);
3060 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003061 errors,
3062 &errorHandler,
3063 "utf16", errmsg,
3064 &starts,
3065 (const char **)&e,
3066 &startinpos,
3067 &endinpos,
3068 &exc,
3069 (const char **)&q,
3070 &unicode,
3071 &outpos,
3072 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003075 /* remaining byte at the end? (size should be even) */
3076 if (e == q) {
3077 if (!consumed) {
3078 errmsg = "truncated data";
3079 startinpos = ((const char *)q) - starts;
3080 endinpos = ((const char *)e) + 1 - starts;
3081 outpos = p - PyUnicode_AS_UNICODE(unicode);
3082 if (unicode_decode_call_errorhandler(
3083 errors,
3084 &errorHandler,
3085 "utf16", errmsg,
3086 &starts,
3087 (const char **)&e,
3088 &startinpos,
3089 &endinpos,
3090 &exc,
3091 (const char **)&q,
3092 &unicode,
3093 &outpos,
3094 &p))
3095 goto onError;
3096 /* The remaining input chars are ignored if the callback
3097 chooses to skip the input */
3098 }
3099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100
3101 if (byteorder)
3102 *byteorder = bo;
3103
Walter Dörwald69652032004-09-07 20:24:22 +00003104 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003108 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 goto onError;
3110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 Py_XDECREF(errorHandler);
3112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 return (PyObject *)unicode;
3114
Benjamin Peterson29060642009-01-31 22:14:21 +00003115 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 Py_XDECREF(errorHandler);
3118 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 return NULL;
3120}
3121
Antoine Pitrouab868312009-01-10 15:40:25 +00003122#undef FAST_CHAR_MASK
3123#undef SWAPPED_FAST_CHAR_MASK
3124
Tim Peters772747b2001-08-09 22:21:55 +00003125PyObject *
3126PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 Py_ssize_t size,
3128 const char *errors,
3129 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003131 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003132 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003133 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003134#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003135 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003136#else
3137 const int pairs = 0;
3138#endif
Tim Peters772747b2001-08-09 22:21:55 +00003139 /* Offsets from p for storing byte pairs in the right order. */
3140#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3141 int ihi = 1, ilo = 0;
3142#else
3143 int ihi = 0, ilo = 1;
3144#endif
3145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146#define STORECHAR(CH) \
3147 do { \
3148 p[ihi] = ((CH) >> 8) & 0xff; \
3149 p[ilo] = (CH) & 0xff; \
3150 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003151 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003153#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003154 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 if (s[i] >= 0x10000)
3156 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003157#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003158 /* 2 * (size + pairs + (byteorder == 0)) */
3159 if (size > PY_SSIZE_T_MAX ||
3160 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003162 nsize = size + pairs + (byteorder == 0);
3163 bytesize = nsize * 2;
3164 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003166 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (v == NULL)
3168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003170 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003173 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003174 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003175
3176 if (byteorder == -1) {
3177 /* force LE */
3178 ihi = 1;
3179 ilo = 0;
3180 }
3181 else if (byteorder == 1) {
3182 /* force BE */
3183 ihi = 0;
3184 ilo = 1;
3185 }
3186
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003187 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 Py_UNICODE ch = *s++;
3189 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003190#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 if (ch >= 0x10000) {
3192 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3193 ch = 0xD800 | ((ch-0x10000) >> 10);
3194 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003195#endif
Tim Peters772747b2001-08-09 22:21:55 +00003196 STORECHAR(ch);
3197 if (ch2)
3198 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003199 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003200
3201 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003202 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003203#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
3206PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3207{
3208 if (!PyUnicode_Check(unicode)) {
3209 PyErr_BadArgument();
3210 return NULL;
3211 }
3212 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 PyUnicode_GET_SIZE(unicode),
3214 NULL,
3215 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216}
3217
3218/* --- Unicode Escape Codec ----------------------------------------------- */
3219
Fredrik Lundh06d12682001-01-24 07:59:11 +00003220static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 Py_ssize_t size,
3224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003227 Py_ssize_t startinpos;
3228 Py_ssize_t endinpos;
3229 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003234 char* message;
3235 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 PyObject *errorHandler = NULL;
3237 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 /* Escaped strings will always be longer than the resulting
3240 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 length after conversion to the true value.
3242 (but if the error callback returns a long replacement string
3243 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 v = _PyUnicode_New(size);
3245 if (v == NULL)
3246 goto onError;
3247 if (size == 0)
3248 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003252
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 while (s < end) {
3254 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003255 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257
3258 /* Non-escape characters are interpreted as Unicode ordinals */
3259 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003260 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 continue;
3262 }
3263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 /* \ - Escapes */
3266 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003267 c = *s++;
3268 if (s > end)
3269 c = '\0'; /* Invalid after \ */
3270 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 case '\n': break;
3274 case '\\': *p++ = '\\'; break;
3275 case '\'': *p++ = '\''; break;
3276 case '\"': *p++ = '\"'; break;
3277 case 'b': *p++ = '\b'; break;
3278 case 'f': *p++ = '\014'; break; /* FF */
3279 case 't': *p++ = '\t'; break;
3280 case 'n': *p++ = '\n'; break;
3281 case 'r': *p++ = '\r'; break;
3282 case 'v': *p++ = '\013'; break; /* VT */
3283 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 case '0': case '1': case '2': case '3':
3287 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003288 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003289 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003290 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003291 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003292 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003294 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 break;
3296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 /* hex escapes */
3298 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003300 digits = 2;
3301 message = "truncated \\xXX escape";
3302 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003306 digits = 4;
3307 message = "truncated \\uXXXX escape";
3308 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003311 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003312 digits = 8;
3313 message = "truncated \\UXXXXXXXX escape";
3314 hexescape:
3315 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 outpos = p-PyUnicode_AS_UNICODE(v);
3317 if (s+digits>end) {
3318 endinpos = size;
3319 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003320 errors, &errorHandler,
3321 "unicodeescape", "end of string in escape sequence",
3322 &starts, &end, &startinpos, &endinpos, &exc, &s,
3323 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 goto onError;
3325 goto nextByte;
3326 }
3327 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003328 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003329 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 endinpos = (s+i+1)-starts;
3331 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 errors, &errorHandler,
3333 "unicodeescape", message,
3334 &starts, &end, &startinpos, &endinpos, &exc, &s,
3335 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003338 }
3339 chr = (chr<<4) & ~0xF;
3340 if (c >= '0' && c <= '9')
3341 chr += c - '0';
3342 else if (c >= 'a' && c <= 'f')
3343 chr += 10 + c - 'a';
3344 else
3345 chr += 10 + c - 'A';
3346 }
3347 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003348 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 /* _decoding_error will have already written into the
3350 target buffer. */
3351 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003352 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003353 /* when we get here, chr is a 32-bit unicode character */
3354 if (chr <= 0xffff)
3355 /* UCS-2 character */
3356 *p++ = (Py_UNICODE) chr;
3357 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003358 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003359 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003360#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003361 *p++ = chr;
3362#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003363 chr -= 0x10000L;
3364 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003365 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003366#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003367 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 endinpos = s-starts;
3369 outpos = p-PyUnicode_AS_UNICODE(v);
3370 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 errors, &errorHandler,
3372 "unicodeescape", "illegal Unicode character",
3373 &starts, &end, &startinpos, &endinpos, &exc, &s,
3374 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003375 goto onError;
3376 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003377 break;
3378
Benjamin Peterson29060642009-01-31 22:14:21 +00003379 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003380 case 'N':
3381 message = "malformed \\N character escape";
3382 if (ucnhash_CAPI == NULL) {
3383 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003384 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003385 if (ucnhash_CAPI == NULL)
3386 goto ucnhashError;
3387 }
3388 if (*s == '{') {
3389 const char *start = s+1;
3390 /* look for the closing brace */
3391 while (*s != '}' && s < end)
3392 s++;
3393 if (s > start && s < end && *s == '}') {
3394 /* found a name. look it up in the unicode database */
3395 message = "unknown Unicode character name";
3396 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003397 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003398 goto store;
3399 }
3400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 endinpos = s-starts;
3402 outpos = p-PyUnicode_AS_UNICODE(v);
3403 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 errors, &errorHandler,
3405 "unicodeescape", message,
3406 &starts, &end, &startinpos, &endinpos, &exc, &s,
3407 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003408 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003409 break;
3410
3411 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003412 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 message = "\\ at end of string";
3414 s--;
3415 endinpos = s-starts;
3416 outpos = p-PyUnicode_AS_UNICODE(v);
3417 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 errors, &errorHandler,
3419 "unicodeescape", message,
3420 &starts, &end, &startinpos, &endinpos, &exc, &s,
3421 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003422 goto onError;
3423 }
3424 else {
3425 *p++ = '\\';
3426 *p++ = (unsigned char)s[-1];
3427 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003428 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003433 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003435 Py_XDECREF(errorHandler);
3436 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003438
Benjamin Peterson29060642009-01-31 22:14:21 +00003439 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003440 PyErr_SetString(
3441 PyExc_UnicodeError,
3442 "\\N escapes not supported (can't load unicodedata module)"
3443 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003444 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 Py_XDECREF(errorHandler);
3446 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003447 return NULL;
3448
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 Py_XDECREF(errorHandler);
3452 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 return NULL;
3454}
3455
3456/* Return a Unicode-Escape string version of the Unicode object.
3457
3458 If quotes is true, the string is enclosed in u"" or u'' quotes as
3459 appropriate.
3460
3461*/
3462
Thomas Wouters477c8d52006-05-27 19:21:47 +00003463Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 Py_ssize_t size,
3465 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003466{
3467 /* like wcschr, but doesn't stop at NULL characters */
3468
3469 while (size-- > 0) {
3470 if (*s == ch)
3471 return s;
3472 s++;
3473 }
3474
3475 return NULL;
3476}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003477
Walter Dörwald79e913e2007-05-12 11:08:06 +00003478static const char *hexdigits = "0123456789abcdef";
3479
3480PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003483 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003486#ifdef Py_UNICODE_WIDE
3487 const Py_ssize_t expandsize = 10;
3488#else
3489 const Py_ssize_t expandsize = 6;
3490#endif
3491
Thomas Wouters89f507f2006-12-13 04:49:30 +00003492 /* XXX(nnorwitz): rather than over-allocating, it would be
3493 better to choose a different scheme. Perhaps scan the
3494 first N-chars of the string and allocate based on that size.
3495 */
3496 /* Initial allocation is based on the longest-possible unichr
3497 escape.
3498
3499 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3500 unichr, so in this case it's the longest unichr escape. In
3501 narrow (UTF-16) builds this is five chars per source unichr
3502 since there are two unichrs in the surrogate pair, so in narrow
3503 (UTF-16) builds it's not the longest unichr escape.
3504
3505 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3506 so in the narrow (UTF-16) build case it's the longest unichr
3507 escape.
3508 */
3509
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003510 if (size == 0)
3511 return PyBytes_FromStringAndSize(NULL, 0);
3512
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003513 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003515
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003516 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 2
3518 + expandsize*size
3519 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (repr == NULL)
3521 return NULL;
3522
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003523 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 while (size-- > 0) {
3526 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003527
Walter Dörwald79e913e2007-05-12 11:08:06 +00003528 /* Escape backslashes */
3529 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 *p++ = '\\';
3531 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003532 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003533 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003534
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003535#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003536 /* Map 21-bit characters to '\U00xxxxxx' */
3537 else if (ch >= 0x10000) {
3538 *p++ = '\\';
3539 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003540 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3541 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3542 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3543 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3544 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3545 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3546 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3547 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003548 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003549 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003550#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3552 else if (ch >= 0xD800 && ch < 0xDC00) {
3553 Py_UNICODE ch2;
3554 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003555
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 ch2 = *s++;
3557 size--;
3558 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3559 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3560 *p++ = '\\';
3561 *p++ = 'U';
3562 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3563 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3564 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3565 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3566 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3567 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3568 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3569 *p++ = hexdigits[ucs & 0x0000000F];
3570 continue;
3571 }
3572 /* Fall through: isolated surrogates are copied as-is */
3573 s--;
3574 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003575 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003576#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003577
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003579 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 *p++ = '\\';
3581 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003582 *p++ = hexdigits[(ch >> 12) & 0x000F];
3583 *p++ = hexdigits[(ch >> 8) & 0x000F];
3584 *p++ = hexdigits[(ch >> 4) & 0x000F];
3585 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003587
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003588 /* Map special whitespace to '\t', \n', '\r' */
3589 else if (ch == '\t') {
3590 *p++ = '\\';
3591 *p++ = 't';
3592 }
3593 else if (ch == '\n') {
3594 *p++ = '\\';
3595 *p++ = 'n';
3596 }
3597 else if (ch == '\r') {
3598 *p++ = '\\';
3599 *p++ = 'r';
3600 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003601
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003602 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003603 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003605 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003606 *p++ = hexdigits[(ch >> 4) & 0x000F];
3607 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003608 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003609
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 /* Copy everything else as-is */
3611 else
3612 *p++ = (char) ch;
3613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003615 assert(p - PyBytes_AS_STRING(repr) > 0);
3616 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3617 return NULL;
3618 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619}
3620
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003621PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003623 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 if (!PyUnicode_Check(unicode)) {
3625 PyErr_BadArgument();
3626 return NULL;
3627 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003628 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3629 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003630 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631}
3632
3633/* --- Raw Unicode Escape Codec ------------------------------------------- */
3634
3635PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 Py_ssize_t size,
3637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003640 Py_ssize_t startinpos;
3641 Py_ssize_t endinpos;
3642 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 const char *end;
3646 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 PyObject *errorHandler = NULL;
3648 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 /* Escaped strings will always be longer than the resulting
3651 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 length after conversion to the true value. (But decoding error
3653 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 v = _PyUnicode_New(size);
3655 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 end = s + size;
3661 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 unsigned char c;
3663 Py_UCS4 x;
3664 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003665 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 /* Non-escape characters are interpreted as Unicode ordinals */
3668 if (*s != '\\') {
3669 *p++ = (unsigned char)*s++;
3670 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 startinpos = s-starts;
3673
3674 /* \u-escapes are only interpreted iff the number of leading
3675 backslashes if odd */
3676 bs = s;
3677 for (;s < end;) {
3678 if (*s != '\\')
3679 break;
3680 *p++ = (unsigned char)*s++;
3681 }
3682 if (((s - bs) & 1) == 0 ||
3683 s >= end ||
3684 (*s != 'u' && *s != 'U')) {
3685 continue;
3686 }
3687 p--;
3688 count = *s=='u' ? 4 : 8;
3689 s++;
3690
3691 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3692 outpos = p-PyUnicode_AS_UNICODE(v);
3693 for (x = 0, i = 0; i < count; ++i, ++s) {
3694 c = (unsigned char)*s;
3695 if (!ISXDIGIT(c)) {
3696 endinpos = s-starts;
3697 if (unicode_decode_call_errorhandler(
3698 errors, &errorHandler,
3699 "rawunicodeescape", "truncated \\uXXXX",
3700 &starts, &end, &startinpos, &endinpos, &exc, &s,
3701 &v, &outpos, &p))
3702 goto onError;
3703 goto nextByte;
3704 }
3705 x = (x<<4) & ~0xF;
3706 if (c >= '0' && c <= '9')
3707 x += c - '0';
3708 else if (c >= 'a' && c <= 'f')
3709 x += 10 + c - 'a';
3710 else
3711 x += 10 + c - 'A';
3712 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003713 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 /* UCS-2 character */
3715 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003716 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 /* UCS-4 character. Either store directly, or as
3718 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003719#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003721#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003722 x -= 0x10000L;
3723 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3724 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003725#endif
3726 } else {
3727 endinpos = s-starts;
3728 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003729 if (unicode_decode_call_errorhandler(
3730 errors, &errorHandler,
3731 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 &starts, &end, &startinpos, &endinpos, &exc, &s,
3733 &v, &outpos, &p))
3734 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 nextByte:
3737 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003739 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 Py_XDECREF(errorHandler);
3742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003744
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_XDECREF(errorHandler);
3748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 return NULL;
3750}
3751
3752PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003755 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 char *p;
3757 char *q;
3758
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003759#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003760 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003761#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003762 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003763#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003764
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003765 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003767
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003768 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 if (repr == NULL)
3770 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003771 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003772 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003774 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 while (size-- > 0) {
3776 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003777#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 /* Map 32-bit characters to '\Uxxxxxxxx' */
3779 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003780 *p++ = '\\';
3781 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003782 *p++ = hexdigits[(ch >> 28) & 0xf];
3783 *p++ = hexdigits[(ch >> 24) & 0xf];
3784 *p++ = hexdigits[(ch >> 20) & 0xf];
3785 *p++ = hexdigits[(ch >> 16) & 0xf];
3786 *p++ = hexdigits[(ch >> 12) & 0xf];
3787 *p++ = hexdigits[(ch >> 8) & 0xf];
3788 *p++ = hexdigits[(ch >> 4) & 0xf];
3789 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003790 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003791 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003792#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3794 if (ch >= 0xD800 && ch < 0xDC00) {
3795 Py_UNICODE ch2;
3796 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003797
Benjamin Peterson29060642009-01-31 22:14:21 +00003798 ch2 = *s++;
3799 size--;
3800 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3801 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3802 *p++ = '\\';
3803 *p++ = 'U';
3804 *p++ = hexdigits[(ucs >> 28) & 0xf];
3805 *p++ = hexdigits[(ucs >> 24) & 0xf];
3806 *p++ = hexdigits[(ucs >> 20) & 0xf];
3807 *p++ = hexdigits[(ucs >> 16) & 0xf];
3808 *p++ = hexdigits[(ucs >> 12) & 0xf];
3809 *p++ = hexdigits[(ucs >> 8) & 0xf];
3810 *p++ = hexdigits[(ucs >> 4) & 0xf];
3811 *p++ = hexdigits[ucs & 0xf];
3812 continue;
3813 }
3814 /* Fall through: isolated surrogates are copied as-is */
3815 s--;
3816 size++;
3817 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003818#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 /* Map 16-bit characters to '\uxxxx' */
3820 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 *p++ = '\\';
3822 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003823 *p++ = hexdigits[(ch >> 12) & 0xf];
3824 *p++ = hexdigits[(ch >> 8) & 0xf];
3825 *p++ = hexdigits[(ch >> 4) & 0xf];
3826 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 /* Copy everything else as-is */
3829 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 *p++ = (char) ch;
3831 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003832 size = p - q;
3833
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003834 assert(size > 0);
3835 if (_PyBytes_Resize(&repr, size) < 0)
3836 return NULL;
3837 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838}
3839
3840PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3841{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003842 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003844 PyErr_BadArgument();
3845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003847 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3848 PyUnicode_GET_SIZE(unicode));
3849
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003850 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851}
3852
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003853/* --- Unicode Internal Codec ------------------------------------------- */
3854
3855PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003856 Py_ssize_t size,
3857 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003858{
3859 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003860 Py_ssize_t startinpos;
3861 Py_ssize_t endinpos;
3862 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003863 PyUnicodeObject *v;
3864 Py_UNICODE *p;
3865 const char *end;
3866 const char *reason;
3867 PyObject *errorHandler = NULL;
3868 PyObject *exc = NULL;
3869
Neal Norwitzd43069c2006-01-08 01:12:10 +00003870#ifdef Py_UNICODE_WIDE
3871 Py_UNICODE unimax = PyUnicode_GetMax();
3872#endif
3873
Thomas Wouters89f507f2006-12-13 04:49:30 +00003874 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003875 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3876 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003877 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003878 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003879 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003880 p = PyUnicode_AS_UNICODE(v);
3881 end = s + size;
3882
3883 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003884 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003885 /* We have to sanity check the raw data, otherwise doom looms for
3886 some malformed UCS-4 data. */
3887 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003888#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003889 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003890#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003891 end-s < Py_UNICODE_SIZE
3892 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003894 startinpos = s - starts;
3895 if (end-s < Py_UNICODE_SIZE) {
3896 endinpos = end-starts;
3897 reason = "truncated input";
3898 }
3899 else {
3900 endinpos = s - starts + Py_UNICODE_SIZE;
3901 reason = "illegal code point (> 0x10FFFF)";
3902 }
3903 outpos = p - PyUnicode_AS_UNICODE(v);
3904 if (unicode_decode_call_errorhandler(
3905 errors, &errorHandler,
3906 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003907 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003908 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003909 goto onError;
3910 }
3911 }
3912 else {
3913 p++;
3914 s += Py_UNICODE_SIZE;
3915 }
3916 }
3917
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003918 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003919 goto onError;
3920 Py_XDECREF(errorHandler);
3921 Py_XDECREF(exc);
3922 return (PyObject *)v;
3923
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003925 Py_XDECREF(v);
3926 Py_XDECREF(errorHandler);
3927 Py_XDECREF(exc);
3928 return NULL;
3929}
3930
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931/* --- Latin-1 Codec ------------------------------------------------------ */
3932
3933PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 Py_ssize_t size,
3935 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936{
3937 PyUnicodeObject *v;
3938 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003939 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003940
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003942 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_UNICODE r = *(unsigned char*)s;
3944 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003945 }
3946
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 v = _PyUnicode_New(size);
3948 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003953 e = s + size;
3954 /* Unrolling the copy makes it much faster by reducing the looping
3955 overhead. This is similar to what many memcpy() implementations do. */
3956 unrolled_end = e - 4;
3957 while (s < unrolled_end) {
3958 p[0] = (unsigned char) s[0];
3959 p[1] = (unsigned char) s[1];
3960 p[2] = (unsigned char) s[2];
3961 p[3] = (unsigned char) s[3];
3962 s += 4;
3963 p += 4;
3964 }
3965 while (s < e)
3966 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003968
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_XDECREF(v);
3971 return NULL;
3972}
3973
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974/* create or adjust a UnicodeEncodeError */
3975static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 const char *encoding,
3977 const Py_UNICODE *unicode, Py_ssize_t size,
3978 Py_ssize_t startpos, Py_ssize_t endpos,
3979 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 *exceptionObject = PyUnicodeEncodeError_Create(
3983 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 }
3985 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3987 goto onError;
3988 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3989 goto onError;
3990 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3991 goto onError;
3992 return;
3993 onError:
3994 Py_DECREF(*exceptionObject);
3995 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 }
3997}
3998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999/* raises a UnicodeEncodeError */
4000static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 const char *encoding,
4002 const Py_UNICODE *unicode, Py_ssize_t size,
4003 Py_ssize_t startpos, Py_ssize_t endpos,
4004 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005{
4006 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010}
4011
4012/* error handling callback helper:
4013 build arguments, call the callback and check the arguments,
4014 put the result into newpos and return the replacement string, which
4015 has to be freed by the caller */
4016static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 PyObject **errorHandler,
4018 const char *encoding, const char *reason,
4019 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4020 Py_ssize_t startpos, Py_ssize_t endpos,
4021 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004023 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024
4025 PyObject *restuple;
4026 PyObject *resunicode;
4027
4028 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 }
4033
4034 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038
4039 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004042 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004044 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 Py_DECREF(restuple);
4046 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004048 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 &resunicode, newpos)) {
4050 Py_DECREF(restuple);
4051 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004053 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4054 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4055 Py_DECREF(restuple);
4056 return NULL;
4057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004060 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4062 Py_DECREF(restuple);
4063 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 Py_INCREF(resunicode);
4066 Py_DECREF(restuple);
4067 return resunicode;
4068}
4069
4070static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 Py_ssize_t size,
4072 const char *errors,
4073 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074{
4075 /* output object */
4076 PyObject *res;
4077 /* pointers to the beginning and end+1 of input */
4078 const Py_UNICODE *startp = p;
4079 const Py_UNICODE *endp = p + size;
4080 /* pointer to the beginning of the unencodable characters */
4081 /* const Py_UNICODE *badp = NULL; */
4082 /* pointer into the output */
4083 char *str;
4084 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004085 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004086 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4087 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 PyObject *errorHandler = NULL;
4089 PyObject *exc = NULL;
4090 /* the following variable is used for caching string comparisons
4091 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4092 int known_errorHandler = -1;
4093
4094 /* allocate enough for a simple encoding without
4095 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004096 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004097 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004098 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004100 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004101 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 ressize = size;
4103
4104 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 /* can we encode this? */
4108 if (c<limit) {
4109 /* no overflow check, because we know that the space is enough */
4110 *str++ = (char)c;
4111 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 else {
4114 Py_ssize_t unicodepos = p-startp;
4115 Py_ssize_t requiredsize;
4116 PyObject *repunicode;
4117 Py_ssize_t repsize;
4118 Py_ssize_t newpos;
4119 Py_ssize_t respos;
4120 Py_UNICODE *uni2;
4121 /* startpos for collecting unencodable chars */
4122 const Py_UNICODE *collstart = p;
4123 const Py_UNICODE *collend = p;
4124 /* find all unecodable characters */
4125 while ((collend < endp) && ((*collend)>=limit))
4126 ++collend;
4127 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4128 if (known_errorHandler==-1) {
4129 if ((errors==NULL) || (!strcmp(errors, "strict")))
4130 known_errorHandler = 1;
4131 else if (!strcmp(errors, "replace"))
4132 known_errorHandler = 2;
4133 else if (!strcmp(errors, "ignore"))
4134 known_errorHandler = 3;
4135 else if (!strcmp(errors, "xmlcharrefreplace"))
4136 known_errorHandler = 4;
4137 else
4138 known_errorHandler = 0;
4139 }
4140 switch (known_errorHandler) {
4141 case 1: /* strict */
4142 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4143 goto onError;
4144 case 2: /* replace */
4145 while (collstart++<collend)
4146 *str++ = '?'; /* fall through */
4147 case 3: /* ignore */
4148 p = collend;
4149 break;
4150 case 4: /* xmlcharrefreplace */
4151 respos = str - PyBytes_AS_STRING(res);
4152 /* determine replacement size (temporarily (mis)uses p) */
4153 for (p = collstart, repsize = 0; p < collend; ++p) {
4154 if (*p<10)
4155 repsize += 2+1+1;
4156 else if (*p<100)
4157 repsize += 2+2+1;
4158 else if (*p<1000)
4159 repsize += 2+3+1;
4160 else if (*p<10000)
4161 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004162#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 else
4164 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004165#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 else if (*p<100000)
4167 repsize += 2+5+1;
4168 else if (*p<1000000)
4169 repsize += 2+6+1;
4170 else
4171 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004172#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 }
4174 requiredsize = respos+repsize+(endp-collend);
4175 if (requiredsize > ressize) {
4176 if (requiredsize<2*ressize)
4177 requiredsize = 2*ressize;
4178 if (_PyBytes_Resize(&res, requiredsize))
4179 goto onError;
4180 str = PyBytes_AS_STRING(res) + respos;
4181 ressize = requiredsize;
4182 }
4183 /* generate replacement (temporarily (mis)uses p) */
4184 for (p = collstart; p < collend; ++p) {
4185 str += sprintf(str, "&#%d;", (int)*p);
4186 }
4187 p = collend;
4188 break;
4189 default:
4190 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4191 encoding, reason, startp, size, &exc,
4192 collstart-startp, collend-startp, &newpos);
4193 if (repunicode == NULL)
4194 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004195 if (PyBytes_Check(repunicode)) {
4196 /* Directly copy bytes result to output. */
4197 repsize = PyBytes_Size(repunicode);
4198 if (repsize > 1) {
4199 /* Make room for all additional bytes. */
4200 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4201 Py_DECREF(repunicode);
4202 goto onError;
4203 }
4204 ressize += repsize-1;
4205 }
4206 memcpy(str, PyBytes_AsString(repunicode), repsize);
4207 str += repsize;
4208 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004209 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004210 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 /* need more space? (at least enough for what we
4213 have+the replacement+the rest of the string, so
4214 we won't have to check space for encodable characters) */
4215 respos = str - PyBytes_AS_STRING(res);
4216 repsize = PyUnicode_GET_SIZE(repunicode);
4217 requiredsize = respos+repsize+(endp-collend);
4218 if (requiredsize > ressize) {
4219 if (requiredsize<2*ressize)
4220 requiredsize = 2*ressize;
4221 if (_PyBytes_Resize(&res, requiredsize)) {
4222 Py_DECREF(repunicode);
4223 goto onError;
4224 }
4225 str = PyBytes_AS_STRING(res) + respos;
4226 ressize = requiredsize;
4227 }
4228 /* check if there is anything unencodable in the replacement
4229 and copy it to the output */
4230 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4231 c = *uni2;
4232 if (c >= limit) {
4233 raise_encode_exception(&exc, encoding, startp, size,
4234 unicodepos, unicodepos+1, reason);
4235 Py_DECREF(repunicode);
4236 goto onError;
4237 }
4238 *str = (char)c;
4239 }
4240 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004241 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004243 }
4244 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004245 /* Resize if we allocated to much */
4246 size = str - PyBytes_AS_STRING(res);
4247 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004248 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004249 if (_PyBytes_Resize(&res, size) < 0)
4250 goto onError;
4251 }
4252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 Py_XDECREF(errorHandler);
4254 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004255 return res;
4256
4257 onError:
4258 Py_XDECREF(res);
4259 Py_XDECREF(errorHandler);
4260 Py_XDECREF(exc);
4261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262}
4263
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 Py_ssize_t size,
4266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269}
4270
4271PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4272{
4273 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 PyErr_BadArgument();
4275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
4277 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 PyUnicode_GET_SIZE(unicode),
4279 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280}
4281
4282/* --- 7-bit ASCII Codec -------------------------------------------------- */
4283
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 Py_ssize_t size,
4286 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 PyUnicodeObject *v;
4290 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 Py_ssize_t startinpos;
4292 Py_ssize_t endinpos;
4293 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 const char *e;
4295 PyObject *errorHandler = NULL;
4296 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004297
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004299 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 Py_UNICODE r = *(unsigned char*)s;
4301 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004302 }
Tim Petersced69f82003-09-16 20:30:58 +00004303
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 v = _PyUnicode_New(size);
4305 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 e = s + size;
4311 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 register unsigned char c = (unsigned char)*s;
4313 if (c < 128) {
4314 *p++ = c;
4315 ++s;
4316 }
4317 else {
4318 startinpos = s-starts;
4319 endinpos = startinpos + 1;
4320 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4321 if (unicode_decode_call_errorhandler(
4322 errors, &errorHandler,
4323 "ascii", "ordinal not in range(128)",
4324 &starts, &e, &startinpos, &endinpos, &exc, &s,
4325 &v, &outpos, &p))
4326 goto onError;
4327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004329 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4331 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 Py_XDECREF(errorHandler);
4333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004335
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 Py_XDECREF(errorHandler);
4339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 return NULL;
4341}
4342
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 Py_ssize_t size,
4345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348}
4349
4350PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4351{
4352 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 PyErr_BadArgument();
4354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 }
4356 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 PyUnicode_GET_SIZE(unicode),
4358 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359}
4360
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004361#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004362
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004363/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004364
/* The MBCS helpers below take their lengths as int.  When size_t is wider
   than int, a buffer can be longer than one call can express, so callers
   process it in several passes. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
4368
4369/* XXX This code is limited to "true" double-byte encodings, as
4370 a) it assumes an incomplete character consists of a single byte, and
4371 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004373
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, zero otherwise.

   The byte must pass IsDBCSLeadByte(); the character before it, located
   with CharPrev(), is then inspected to rule out the case where the byte
   is really the second half of the preceding character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4384
4385/*
4386 * Decode MBCS string into unicode object. If 'final' is set, converts
4387 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4388 */
4389static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 const char *s, /* MBCS string */
4391 int size, /* sizeof MBCS string */
4392 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004393{
4394 Py_UNICODE *p;
4395 Py_ssize_t n = 0;
4396 int usize = 0;
4397
4398 assert(size >= 0);
4399
4400 /* Skip trailing lead-byte unless 'final' is set */
4401 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004403
4404 /* First get the size of the result */
4405 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4407 if (usize == 0) {
4408 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4409 return -1;
4410 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004411 }
4412
4413 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 /* Create unicode object */
4415 *v = _PyUnicode_New(usize);
4416 if (*v == NULL)
4417 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004418 }
4419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 /* Extend unicode object */
4421 n = PyUnicode_GET_SIZE(*v);
4422 if (_PyUnicode_Resize(v, n + usize) < 0)
4423 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004424 }
4425
4426 /* Do the conversion */
4427 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 p = PyUnicode_AS_UNICODE(*v) + n;
4429 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4430 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4431 return -1;
4432 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004433 }
4434
4435 return size;
4436}
4437
4438PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 Py_ssize_t size,
4440 const char *errors,
4441 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004442{
4443 PyUnicodeObject *v = NULL;
4444 int done;
4445
4446 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004448
4449#ifdef NEED_RETRY
4450 retry:
4451 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004453 else
4454#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004456
4457 if (done < 0) {
4458 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004460 }
4461
4462 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004464
4465#ifdef NEED_RETRY
4466 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 s += done;
4468 size -= done;
4469 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004470 }
4471#endif
4472
4473 return (PyObject *)v;
4474}
4475
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004476PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 Py_ssize_t size,
4478 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004479{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004480 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4481}
4482
4483/*
4484 * Convert unicode into string object (MBCS).
4485 * Returns 0 if succeed, -1 otherwise.
4486 */
4487static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 const Py_UNICODE *p, /* unicode */
4489 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004490{
4491 int mbcssize = 0;
4492 Py_ssize_t n = 0;
4493
4494 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004495
4496 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004497 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4499 if (mbcssize == 0) {
4500 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4501 return -1;
4502 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004503 }
4504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004505 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 /* Create string object */
4507 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4508 if (*repr == NULL)
4509 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004510 }
4511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 /* Extend string object */
4513 n = PyBytes_Size(*repr);
4514 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4515 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004516 }
4517
4518 /* Do the conversion */
4519 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 char *s = PyBytes_AS_STRING(*repr) + n;
4521 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4522 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4523 return -1;
4524 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004525 }
4526
4527 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004528}
4529
4530PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 Py_ssize_t size,
4532 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004533{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004534 PyObject *repr = NULL;
4535 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004536
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004537#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004539 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004541 else
4542#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004544
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 Py_XDECREF(repr);
4547 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004548 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549
4550#ifdef NEED_RETRY
4551 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 p += INT_MAX;
4553 size -= INT_MAX;
4554 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004555 }
4556#endif
4557
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004558 return repr;
4559}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004560
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004561PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4562{
4563 if (!PyUnicode_Check(unicode)) {
4564 PyErr_BadArgument();
4565 return NULL;
4566 }
4567 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 PyUnicode_GET_SIZE(unicode),
4569 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004570}
4571
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572#undef NEED_RETRY
4573
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004574#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004575
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576/* --- Character Mapping Codec -------------------------------------------- */
4577
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 Py_ssize_t size,
4580 PyObject *mapping,
4581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004584 Py_ssize_t startinpos;
4585 Py_ssize_t endinpos;
4586 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 PyUnicodeObject *v;
4589 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004590 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 PyObject *errorHandler = NULL;
4592 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004593 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004594 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004595
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 /* Default to Latin-1 */
4597 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599
4600 v = _PyUnicode_New(size);
4601 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004607 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 mapstring = PyUnicode_AS_UNICODE(mapping);
4609 maplen = PyUnicode_GET_SIZE(mapping);
4610 while (s < e) {
4611 unsigned char ch = *s;
4612 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 if (ch < maplen)
4615 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 if (x == 0xfffe) {
4618 /* undefined mapping */
4619 outpos = p-PyUnicode_AS_UNICODE(v);
4620 startinpos = s-starts;
4621 endinpos = startinpos+1;
4622 if (unicode_decode_call_errorhandler(
4623 errors, &errorHandler,
4624 "charmap", "character maps to <undefined>",
4625 &starts, &e, &startinpos, &endinpos, &exc, &s,
4626 &v, &outpos, &p)) {
4627 goto onError;
4628 }
4629 continue;
4630 }
4631 *p++ = x;
4632 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004633 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004634 }
4635 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 while (s < e) {
4637 unsigned char ch = *s;
4638 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004639
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4641 w = PyLong_FromLong((long)ch);
4642 if (w == NULL)
4643 goto onError;
4644 x = PyObject_GetItem(mapping, w);
4645 Py_DECREF(w);
4646 if (x == NULL) {
4647 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4648 /* No mapping found means: mapping is undefined. */
4649 PyErr_Clear();
4650 x = Py_None;
4651 Py_INCREF(x);
4652 } else
4653 goto onError;
4654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004655
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 /* Apply mapping */
4657 if (PyLong_Check(x)) {
4658 long value = PyLong_AS_LONG(x);
4659 if (value < 0 || value > 65535) {
4660 PyErr_SetString(PyExc_TypeError,
4661 "character mapping must be in range(65536)");
4662 Py_DECREF(x);
4663 goto onError;
4664 }
4665 *p++ = (Py_UNICODE)value;
4666 }
4667 else if (x == Py_None) {
4668 /* undefined mapping */
4669 outpos = p-PyUnicode_AS_UNICODE(v);
4670 startinpos = s-starts;
4671 endinpos = startinpos+1;
4672 if (unicode_decode_call_errorhandler(
4673 errors, &errorHandler,
4674 "charmap", "character maps to <undefined>",
4675 &starts, &e, &startinpos, &endinpos, &exc, &s,
4676 &v, &outpos, &p)) {
4677 Py_DECREF(x);
4678 goto onError;
4679 }
4680 Py_DECREF(x);
4681 continue;
4682 }
4683 else if (PyUnicode_Check(x)) {
4684 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004685
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 if (targetsize == 1)
4687 /* 1-1 mapping */
4688 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004689
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 else if (targetsize > 1) {
4691 /* 1-n mapping */
4692 if (targetsize > extrachars) {
4693 /* resize first */
4694 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4695 Py_ssize_t needed = (targetsize - extrachars) + \
4696 (targetsize << 2);
4697 extrachars += needed;
4698 /* XXX overflow detection missing */
4699 if (_PyUnicode_Resize(&v,
4700 PyUnicode_GET_SIZE(v) + needed) < 0) {
4701 Py_DECREF(x);
4702 goto onError;
4703 }
4704 p = PyUnicode_AS_UNICODE(v) + oldpos;
4705 }
4706 Py_UNICODE_COPY(p,
4707 PyUnicode_AS_UNICODE(x),
4708 targetsize);
4709 p += targetsize;
4710 extrachars -= targetsize;
4711 }
4712 /* 1-0 mapping: skip the character */
4713 }
4714 else {
4715 /* wrong return value */
4716 PyErr_SetString(PyExc_TypeError,
4717 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004718 Py_DECREF(x);
4719 goto onError;
4720 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 Py_DECREF(x);
4722 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 }
4725 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4727 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(errorHandler);
4729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(errorHandler);
4734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 Py_XDECREF(v);
4736 return NULL;
4737}
4738
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004739/* Charmap encoding: the lookup table */
4740
4741struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 PyObject_HEAD
4743 unsigned char level1[32];
4744 int count2, count3;
4745 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004746};
4747
4748static PyObject*
4749encoding_map_size(PyObject *obj, PyObject* args)
4750{
4751 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004752 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004754}
4755
4756static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004757 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 PyDoc_STR("Return the size (in bytes) of this object") },
4759 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004760};
4761
4762static void
4763encoding_map_dealloc(PyObject* o)
4764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004766}
4767
4768static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004769 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 "EncodingMap", /*tp_name*/
4771 sizeof(struct encoding_map), /*tp_basicsize*/
4772 0, /*tp_itemsize*/
4773 /* methods */
4774 encoding_map_dealloc, /*tp_dealloc*/
4775 0, /*tp_print*/
4776 0, /*tp_getattr*/
4777 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004778 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 0, /*tp_repr*/
4780 0, /*tp_as_number*/
4781 0, /*tp_as_sequence*/
4782 0, /*tp_as_mapping*/
4783 0, /*tp_hash*/
4784 0, /*tp_call*/
4785 0, /*tp_str*/
4786 0, /*tp_getattro*/
4787 0, /*tp_setattro*/
4788 0, /*tp_as_buffer*/
4789 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4790 0, /*tp_doc*/
4791 0, /*tp_traverse*/
4792 0, /*tp_clear*/
4793 0, /*tp_richcompare*/
4794 0, /*tp_weaklistoffset*/
4795 0, /*tp_iter*/
4796 0, /*tp_iternext*/
4797 encoding_map_methods, /*tp_methods*/
4798 0, /*tp_members*/
4799 0, /*tp_getset*/
4800 0, /*tp_base*/
4801 0, /*tp_dict*/
4802 0, /*tp_descr_get*/
4803 0, /*tp_descr_set*/
4804 0, /*tp_dictoffset*/
4805 0, /*tp_init*/
4806 0, /*tp_alloc*/
4807 0, /*tp_new*/
4808 0, /*tp_free*/
4809 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004810};
4811
4812PyObject*
4813PyUnicode_BuildEncodingMap(PyObject* string)
4814{
4815 Py_UNICODE *decode;
4816 PyObject *result;
4817 struct encoding_map *mresult;
4818 int i;
4819 int need_dict = 0;
4820 unsigned char level1[32];
4821 unsigned char level2[512];
4822 unsigned char *mlevel1, *mlevel2, *mlevel3;
4823 int count2 = 0, count3 = 0;
4824
4825 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4826 PyErr_BadArgument();
4827 return NULL;
4828 }
4829 decode = PyUnicode_AS_UNICODE(string);
4830 memset(level1, 0xFF, sizeof level1);
4831 memset(level2, 0xFF, sizeof level2);
4832
4833 /* If there isn't a one-to-one mapping of NULL to \0,
4834 or if there are non-BMP characters, we need to use
4835 a mapping dictionary. */
4836 if (decode[0] != 0)
4837 need_dict = 1;
4838 for (i = 1; i < 256; i++) {
4839 int l1, l2;
4840 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004841#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004842 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004843#endif
4844 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004845 need_dict = 1;
4846 break;
4847 }
4848 if (decode[i] == 0xFFFE)
4849 /* unmapped character */
4850 continue;
4851 l1 = decode[i] >> 11;
4852 l2 = decode[i] >> 7;
4853 if (level1[l1] == 0xFF)
4854 level1[l1] = count2++;
4855 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004856 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004857 }
4858
4859 if (count2 >= 0xFF || count3 >= 0xFF)
4860 need_dict = 1;
4861
4862 if (need_dict) {
4863 PyObject *result = PyDict_New();
4864 PyObject *key, *value;
4865 if (!result)
4866 return NULL;
4867 for (i = 0; i < 256; i++) {
4868 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004869 key = PyLong_FromLong(decode[i]);
4870 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004871 if (!key || !value)
4872 goto failed1;
4873 if (PyDict_SetItem(result, key, value) == -1)
4874 goto failed1;
4875 Py_DECREF(key);
4876 Py_DECREF(value);
4877 }
4878 return result;
4879 failed1:
4880 Py_XDECREF(key);
4881 Py_XDECREF(value);
4882 Py_DECREF(result);
4883 return NULL;
4884 }
4885
4886 /* Create a three-level trie */
4887 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4888 16*count2 + 128*count3 - 1);
4889 if (!result)
4890 return PyErr_NoMemory();
4891 PyObject_Init(result, &EncodingMapType);
4892 mresult = (struct encoding_map*)result;
4893 mresult->count2 = count2;
4894 mresult->count3 = count3;
4895 mlevel1 = mresult->level1;
4896 mlevel2 = mresult->level23;
4897 mlevel3 = mresult->level23 + 16*count2;
4898 memcpy(mlevel1, level1, 32);
4899 memset(mlevel2, 0xFF, 16*count2);
4900 memset(mlevel3, 0, 128*count3);
4901 count3 = 0;
4902 for (i = 1; i < 256; i++) {
4903 int o1, o2, o3, i2, i3;
4904 if (decode[i] == 0xFFFE)
4905 /* unmapped character */
4906 continue;
4907 o1 = decode[i]>>11;
4908 o2 = (decode[i]>>7) & 0xF;
4909 i2 = 16*mlevel1[o1] + o2;
4910 if (mlevel2[i2] == 0xFF)
4911 mlevel2[i2] = count3++;
4912 o3 = decode[i] & 0x7F;
4913 i3 = 128*mlevel2[i2] + o3;
4914 mlevel3[i3] = i;
4915 }
4916 return result;
4917}
4918
4919static int
4920encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4921{
4922 struct encoding_map *map = (struct encoding_map*)mapping;
4923 int l1 = c>>11;
4924 int l2 = (c>>7) & 0xF;
4925 int l3 = c & 0x7F;
4926 int i;
4927
4928#ifdef Py_UNICODE_WIDE
4929 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004931 }
4932#endif
4933 if (c == 0)
4934 return 0;
4935 /* level 1*/
4936 i = map->level1[l1];
4937 if (i == 0xFF) {
4938 return -1;
4939 }
4940 /* level 2*/
4941 i = map->level23[16*i+l2];
4942 if (i == 0xFF) {
4943 return -1;
4944 }
4945 /* level 3 */
4946 i = map->level23[16*map->count2 + 128*i + l3];
4947 if (i == 0) {
4948 return -1;
4949 }
4950 return i;
4951}
4952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953/* Lookup the character ch in the mapping. If the character
4954 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004955 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957{
Christian Heimes217cfd12007-12-02 14:31:20 +00004958 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 PyObject *x;
4960
4961 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004963 x = PyObject_GetItem(mapping, w);
4964 Py_DECREF(w);
4965 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4967 /* No mapping found means: mapping is undefined. */
4968 PyErr_Clear();
4969 x = Py_None;
4970 Py_INCREF(x);
4971 return x;
4972 } else
4973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004975 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004977 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 long value = PyLong_AS_LONG(x);
4979 if (value < 0 || value > 255) {
4980 PyErr_SetString(PyExc_TypeError,
4981 "character mapping must be in range(256)");
4982 Py_DECREF(x);
4983 return NULL;
4984 }
4985 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004987 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 /* wrong return value */
4991 PyErr_Format(PyExc_TypeError,
4992 "character mapping must return integer, bytes or None, not %.400s",
4993 x->ob_type->tp_name);
4994 Py_DECREF(x);
4995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 }
4997}
4998
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004999static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005000charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005001{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005002 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5003 /* exponentially overallocate to minimize reallocations */
5004 if (requiredsize < 2*outsize)
5005 requiredsize = 2*outsize;
5006 if (_PyBytes_Resize(outobj, requiredsize))
5007 return -1;
5008 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005009}
5010
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005015 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 space is available. Return a new reference to the object that
5017 was put in the output buffer, or Py_None, if the mapping was undefined
5018 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005019 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005021charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005024 PyObject *rep;
5025 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005026 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027
Christian Heimes90aa7642007-12-19 02:45:37 +00005028 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005029 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005031 if (res == -1)
5032 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 if (outsize<requiredsize)
5034 if (charmapencode_resize(outobj, outpos, requiredsize))
5035 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005036 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 outstart[(*outpos)++] = (char)res;
5038 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005039 }
5040
5041 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005044 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 Py_DECREF(rep);
5046 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005047 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (PyLong_Check(rep)) {
5049 Py_ssize_t requiredsize = *outpos+1;
5050 if (outsize<requiredsize)
5051 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5052 Py_DECREF(rep);
5053 return enc_EXCEPTION;
5054 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005055 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 else {
5059 const char *repchars = PyBytes_AS_STRING(rep);
5060 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5061 Py_ssize_t requiredsize = *outpos+repsize;
5062 if (outsize<requiredsize)
5063 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5064 Py_DECREF(rep);
5065 return enc_EXCEPTION;
5066 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005067 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 memcpy(outstart + *outpos, repchars, repsize);
5069 *outpos += repsize;
5070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005072 Py_DECREF(rep);
5073 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074}
5075
5076/* handle an error in PyUnicode_EncodeCharmap
5077 Return 0 on success, -1 on error */
5078static
5079int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005080 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005082 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005083 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084{
5085 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005086 Py_ssize_t repsize;
5087 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 Py_UNICODE *uni2;
5089 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005090 Py_ssize_t collstartpos = *inpos;
5091 Py_ssize_t collendpos = *inpos+1;
5092 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 char *encoding = "charmap";
5094 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005095 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 /* find all unencodable characters */
5098 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005099 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005100 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 int res = encoding_map_lookup(p[collendpos], mapping);
5102 if (res != -1)
5103 break;
5104 ++collendpos;
5105 continue;
5106 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005107
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 rep = charmapencode_lookup(p[collendpos], mapping);
5109 if (rep==NULL)
5110 return -1;
5111 else if (rep!=Py_None) {
5112 Py_DECREF(rep);
5113 break;
5114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005115 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 }
5118 /* cache callback name lookup
5119 * (if not done yet, i.e. it's the first error) */
5120 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 if ((errors==NULL) || (!strcmp(errors, "strict")))
5122 *known_errorHandler = 1;
5123 else if (!strcmp(errors, "replace"))
5124 *known_errorHandler = 2;
5125 else if (!strcmp(errors, "ignore"))
5126 *known_errorHandler = 3;
5127 else if (!strcmp(errors, "xmlcharrefreplace"))
5128 *known_errorHandler = 4;
5129 else
5130 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 }
5132 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005133 case 1: /* strict */
5134 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5135 return -1;
5136 case 2: /* replace */
5137 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 x = charmapencode_output('?', mapping, res, respos);
5139 if (x==enc_EXCEPTION) {
5140 return -1;
5141 }
5142 else if (x==enc_FAILED) {
5143 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5144 return -1;
5145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005146 }
5147 /* fall through */
5148 case 3: /* ignore */
5149 *inpos = collendpos;
5150 break;
5151 case 4: /* xmlcharrefreplace */
5152 /* generate replacement (temporarily (mis)uses p) */
5153 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 char buffer[2+29+1+1];
5155 char *cp;
5156 sprintf(buffer, "&#%d;", (int)p[collpos]);
5157 for (cp = buffer; *cp; ++cp) {
5158 x = charmapencode_output(*cp, mapping, res, respos);
5159 if (x==enc_EXCEPTION)
5160 return -1;
5161 else if (x==enc_FAILED) {
5162 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5163 return -1;
5164 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005165 }
5166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005167 *inpos = collendpos;
5168 break;
5169 default:
5170 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 encoding, reason, p, size, exceptionObject,
5172 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005173 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005175 if (PyBytes_Check(repunicode)) {
5176 /* Directly copy bytes result to output. */
5177 Py_ssize_t outsize = PyBytes_Size(*res);
5178 Py_ssize_t requiredsize;
5179 repsize = PyBytes_Size(repunicode);
5180 requiredsize = *respos + repsize;
5181 if (requiredsize > outsize)
5182 /* Make room for all additional bytes. */
5183 if (charmapencode_resize(res, respos, requiredsize)) {
5184 Py_DECREF(repunicode);
5185 return -1;
5186 }
5187 memcpy(PyBytes_AsString(*res) + *respos,
5188 PyBytes_AsString(repunicode), repsize);
5189 *respos += repsize;
5190 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005191 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005192 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005194 /* generate replacement */
5195 repsize = PyUnicode_GET_SIZE(repunicode);
5196 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 x = charmapencode_output(*uni2, mapping, res, respos);
5198 if (x==enc_EXCEPTION) {
5199 return -1;
5200 }
5201 else if (x==enc_FAILED) {
5202 Py_DECREF(repunicode);
5203 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5204 return -1;
5205 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005206 }
5207 *inpos = newpos;
5208 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 }
5210 return 0;
5211}
5212
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 Py_ssize_t size,
5215 PyObject *mapping,
5216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 /* output object */
5219 PyObject *res = NULL;
5220 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005221 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005223 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005224 PyObject *errorHandler = NULL;
5225 PyObject *exc = NULL;
5226 /* the following variable is used for caching string comparisons
5227 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5228 * 3=ignore, 4=xmlcharrefreplace */
5229 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
5231 /* Default to Latin-1 */
5232 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 /* allocate enough for a simple encoding without
5236 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005237 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 if (res == NULL)
5239 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005240 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 /* try to encode it */
5245 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5246 if (x==enc_EXCEPTION) /* error */
5247 goto onError;
5248 if (x==enc_FAILED) { /* unencodable character */
5249 if (charmap_encoding_error(p, size, &inpos, mapping,
5250 &exc,
5251 &known_errorHandler, &errorHandler, errors,
5252 &res, &respos)) {
5253 goto onError;
5254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 else
5257 /* done with this character => adjust input position */
5258 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005261 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005262 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005263 if (_PyBytes_Resize(&res, respos) < 0)
5264 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 Py_XDECREF(exc);
5267 Py_XDECREF(errorHandler);
5268 return res;
5269
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 Py_XDECREF(res);
5272 Py_XDECREF(exc);
5273 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 return NULL;
5275}
5276
5277PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279{
5280 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 PyErr_BadArgument();
5282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 }
5284 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 PyUnicode_GET_SIZE(unicode),
5286 mapping,
5287 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288}
5289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290/* create or adjust a UnicodeTranslateError */
5291static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 const Py_UNICODE *unicode, Py_ssize_t size,
5293 Py_ssize_t startpos, Py_ssize_t endpos,
5294 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005297 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 }
5300 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5302 goto onError;
5303 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5304 goto onError;
5305 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5306 goto onError;
5307 return;
5308 onError:
5309 Py_DECREF(*exceptionObject);
5310 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 }
5312}
5313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314/* raises a UnicodeTranslateError */
5315static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 const Py_UNICODE *unicode, Py_ssize_t size,
5317 Py_ssize_t startpos, Py_ssize_t endpos,
5318 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319{
5320 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324}
5325
5326/* error handling callback helper:
5327 build arguments, call the callback and check the arguments,
5328 put the result into newpos and return the replacement string, which
5329 has to be freed by the caller */
5330static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 PyObject **errorHandler,
5332 const char *reason,
5333 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5334 Py_ssize_t startpos, Py_ssize_t endpos,
5335 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005337 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005339 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 PyObject *restuple;
5341 PyObject *resunicode;
5342
5343 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 }
5348
5349 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353
5354 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005359 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 Py_DECREF(restuple);
5361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362 }
5363 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 &resunicode, &i_newpos)) {
5365 Py_DECREF(restuple);
5366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370 else
5371 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005372 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5374 Py_DECREF(restuple);
5375 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 Py_INCREF(resunicode);
5378 Py_DECREF(restuple);
5379 return resunicode;
5380}
5381
5382/* Lookup the character ch in the mapping and put the result in result,
5383 which must be decrefed by the caller.
5384 Return 0 on success, -1 on error */
5385static
5386int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5387{
Christian Heimes217cfd12007-12-02 14:31:20 +00005388 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 PyObject *x;
5390
5391 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 x = PyObject_GetItem(mapping, w);
5394 Py_DECREF(w);
5395 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5397 /* No mapping found means: use 1:1 mapping. */
5398 PyErr_Clear();
5399 *result = NULL;
5400 return 0;
5401 } else
5402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 }
5404 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 *result = x;
5406 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005408 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 long value = PyLong_AS_LONG(x);
5410 long max = PyUnicode_GetMax();
5411 if (value < 0 || value > max) {
5412 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005413 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 Py_DECREF(x);
5415 return -1;
5416 }
5417 *result = x;
5418 return 0;
5419 }
5420 else if (PyUnicode_Check(x)) {
5421 *result = x;
5422 return 0;
5423 }
5424 else {
5425 /* wrong return value */
5426 PyErr_SetString(PyExc_TypeError,
5427 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005428 Py_DECREF(x);
5429 return -1;
5430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431}
5432/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 if not reallocate and adjust various state variables.
5434 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435static
Walter Dörwald4894c302003-10-24 14:25:28 +00005436int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005440 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 /* remember old output position */
5442 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5443 /* exponentially overallocate to minimize reallocations */
5444 if (requiredsize < 2 * oldsize)
5445 requiredsize = 2 * oldsize;
5446 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5447 return -1;
5448 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 }
5450 return 0;
5451}
5452/* lookup the character, put the result in the output string and adjust
5453 various state variables. Return a new reference to the object that
5454 was put in the output buffer in *result, or Py_None, if the mapping was
5455 undefined (in which case no character was written).
5456 The called must decref result.
5457 Return 0 on success, -1 on error. */
5458static
Walter Dörwald4894c302003-10-24 14:25:28 +00005459int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5461 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462{
Walter Dörwald4894c302003-10-24 14:25:28 +00005463 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 /* not found => default to 1:1 mapping */
5467 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 }
5469 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005471 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 /* no overflow check, because we know that the space is enough */
5473 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 }
5475 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5477 if (repsize==1) {
5478 /* no overflow check, because we know that the space is enough */
5479 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5480 }
5481 else if (repsize!=0) {
5482 /* more than one character */
5483 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5484 (insize - (curinp-startinp)) +
5485 repsize - 1;
5486 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5487 return -1;
5488 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5489 *outp += repsize;
5490 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 }
5492 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 return 0;
5495}
5496
5497PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 Py_ssize_t size,
5499 PyObject *mapping,
5500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 /* output object */
5503 PyObject *res = NULL;
5504 /* pointers to the beginning and end+1 of input */
5505 const Py_UNICODE *startp = p;
5506 const Py_UNICODE *endp = p + size;
5507 /* pointer into the output */
5508 Py_UNICODE *str;
5509 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005510 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 char *reason = "character maps to <undefined>";
5512 PyObject *errorHandler = NULL;
5513 PyObject *exc = NULL;
5514 /* the following variable is used for caching string comparisons
5515 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5516 * 3=ignore, 4=xmlcharrefreplace */
5517 int known_errorHandler = -1;
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 PyErr_BadArgument();
5521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523
5524 /* allocate enough for a simple 1:1 translation without
5525 replacements, if we need more, we'll resize */
5526 res = PyUnicode_FromUnicode(NULL, size);
5527 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 /* try to encode it */
5535 PyObject *x = NULL;
5536 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5537 Py_XDECREF(x);
5538 goto onError;
5539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005540 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 if (x!=Py_None) /* it worked => adjust input pointer */
5542 ++p;
5543 else { /* untranslatable character */
5544 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5545 Py_ssize_t repsize;
5546 Py_ssize_t newpos;
5547 Py_UNICODE *uni2;
5548 /* startpos for collecting untranslatable chars */
5549 const Py_UNICODE *collstart = p;
5550 const Py_UNICODE *collend = p+1;
5551 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 /* find all untranslatable characters */
5554 while (collend < endp) {
5555 if (charmaptranslate_lookup(*collend, mapping, &x))
5556 goto onError;
5557 Py_XDECREF(x);
5558 if (x!=Py_None)
5559 break;
5560 ++collend;
5561 }
5562 /* cache callback name lookup
5563 * (if not done yet, i.e. it's the first error) */
5564 if (known_errorHandler==-1) {
5565 if ((errors==NULL) || (!strcmp(errors, "strict")))
5566 known_errorHandler = 1;
5567 else if (!strcmp(errors, "replace"))
5568 known_errorHandler = 2;
5569 else if (!strcmp(errors, "ignore"))
5570 known_errorHandler = 3;
5571 else if (!strcmp(errors, "xmlcharrefreplace"))
5572 known_errorHandler = 4;
5573 else
5574 known_errorHandler = 0;
5575 }
5576 switch (known_errorHandler) {
5577 case 1: /* strict */
5578 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005579 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 case 2: /* replace */
5581 /* No need to check for space, this is a 1:1 replacement */
5582 for (coll = collstart; coll<collend; ++coll)
5583 *str++ = '?';
5584 /* fall through */
5585 case 3: /* ignore */
5586 p = collend;
5587 break;
5588 case 4: /* xmlcharrefreplace */
5589 /* generate replacement (temporarily (mis)uses p) */
5590 for (p = collstart; p < collend; ++p) {
5591 char buffer[2+29+1+1];
5592 char *cp;
5593 sprintf(buffer, "&#%d;", (int)*p);
5594 if (charmaptranslate_makespace(&res, &str,
5595 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5596 goto onError;
5597 for (cp = buffer; *cp; ++cp)
5598 *str++ = *cp;
5599 }
5600 p = collend;
5601 break;
5602 default:
5603 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5604 reason, startp, size, &exc,
5605 collstart-startp, collend-startp, &newpos);
5606 if (repunicode == NULL)
5607 goto onError;
5608 /* generate replacement */
5609 repsize = PyUnicode_GET_SIZE(repunicode);
5610 if (charmaptranslate_makespace(&res, &str,
5611 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5612 Py_DECREF(repunicode);
5613 goto onError;
5614 }
5615 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5616 *str++ = *uni2;
5617 p = startp + newpos;
5618 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005620 }
5621 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 /* Resize if we allocated to much */
5623 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005624 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 if (PyUnicode_Resize(&res, respos) < 0)
5626 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 }
5628 Py_XDECREF(exc);
5629 Py_XDECREF(errorHandler);
5630 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 Py_XDECREF(res);
5634 Py_XDECREF(exc);
5635 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 return NULL;
5637}
5638
5639PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 PyObject *mapping,
5641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642{
5643 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005644
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 str = PyUnicode_FromObject(str);
5646 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 PyUnicode_GET_SIZE(str),
5650 mapping,
5651 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 Py_DECREF(str);
5653 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005654
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 Py_XDECREF(str);
5657 return NULL;
5658}
Tim Petersced69f82003-09-16 20:30:58 +00005659
Guido van Rossum9e896b32000-04-05 20:11:21 +00005660/* --- Decimal Encoder ---------------------------------------------------- */
5661
5662int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 Py_ssize_t length,
5664 char *output,
5665 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005666{
5667 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 PyObject *errorHandler = NULL;
5669 PyObject *exc = NULL;
5670 const char *encoding = "decimal";
5671 const char *reason = "invalid decimal Unicode string";
5672 /* the following variable is used for caching string comparisons
5673 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5674 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005675
5676 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 PyErr_BadArgument();
5678 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005679 }
5680
5681 p = s;
5682 end = s + length;
5683 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 register Py_UNICODE ch = *p;
5685 int decimal;
5686 PyObject *repunicode;
5687 Py_ssize_t repsize;
5688 Py_ssize_t newpos;
5689 Py_UNICODE *uni2;
5690 Py_UNICODE *collstart;
5691 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005692
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005694 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 ++p;
5696 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 decimal = Py_UNICODE_TODECIMAL(ch);
5699 if (decimal >= 0) {
5700 *output++ = '0' + decimal;
5701 ++p;
5702 continue;
5703 }
5704 if (0 < ch && ch < 256) {
5705 *output++ = (char)ch;
5706 ++p;
5707 continue;
5708 }
5709 /* All other characters are considered unencodable */
5710 collstart = p;
5711 collend = p+1;
5712 while (collend < end) {
5713 if ((0 < *collend && *collend < 256) ||
5714 !Py_UNICODE_ISSPACE(*collend) ||
5715 Py_UNICODE_TODECIMAL(*collend))
5716 break;
5717 }
5718 /* cache callback name lookup
5719 * (if not done yet, i.e. it's the first error) */
5720 if (known_errorHandler==-1) {
5721 if ((errors==NULL) || (!strcmp(errors, "strict")))
5722 known_errorHandler = 1;
5723 else if (!strcmp(errors, "replace"))
5724 known_errorHandler = 2;
5725 else if (!strcmp(errors, "ignore"))
5726 known_errorHandler = 3;
5727 else if (!strcmp(errors, "xmlcharrefreplace"))
5728 known_errorHandler = 4;
5729 else
5730 known_errorHandler = 0;
5731 }
5732 switch (known_errorHandler) {
5733 case 1: /* strict */
5734 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5735 goto onError;
5736 case 2: /* replace */
5737 for (p = collstart; p < collend; ++p)
5738 *output++ = '?';
5739 /* fall through */
5740 case 3: /* ignore */
5741 p = collend;
5742 break;
5743 case 4: /* xmlcharrefreplace */
5744 /* generate replacement (temporarily (mis)uses p) */
5745 for (p = collstart; p < collend; ++p)
5746 output += sprintf(output, "&#%d;", (int)*p);
5747 p = collend;
5748 break;
5749 default:
5750 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5751 encoding, reason, s, length, &exc,
5752 collstart-s, collend-s, &newpos);
5753 if (repunicode == NULL)
5754 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005755 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005756 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005757 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5758 Py_DECREF(repunicode);
5759 goto onError;
5760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* generate replacement */
5762 repsize = PyUnicode_GET_SIZE(repunicode);
5763 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5764 Py_UNICODE ch = *uni2;
5765 if (Py_UNICODE_ISSPACE(ch))
5766 *output++ = ' ';
5767 else {
5768 decimal = Py_UNICODE_TODECIMAL(ch);
5769 if (decimal >= 0)
5770 *output++ = '0' + decimal;
5771 else if (0 < ch && ch < 256)
5772 *output++ = (char)ch;
5773 else {
5774 Py_DECREF(repunicode);
5775 raise_encode_exception(&exc, encoding,
5776 s, length, collstart-s, collend-s, reason);
5777 goto onError;
5778 }
5779 }
5780 }
5781 p = s + newpos;
5782 Py_DECREF(repunicode);
5783 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005784 }
5785 /* 0-terminate the output string */
5786 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 Py_XDECREF(exc);
5788 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005789 return 0;
5790
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 Py_XDECREF(exc);
5793 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005794 return -1;
5795}
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797/* --- Helpers ------------------------------------------------------------ */
5798
Eric Smith8c663262007-08-25 02:26:07 +00005799#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005800#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005801#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005802/* Include _ParseTupleFinds from find.h */
5803#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005804#include "stringlib/find.h"
5805#include "stringlib/partition.h"
5806
Eric Smith5807c412008-05-11 21:00:57 +00005807#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005808#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005809#include "stringlib/localeutil.h"
5810
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope:
   a negative `start` is offset by (obj)->length and then clamped to 0;
   `end` is capped at (obj)->length, and a negative `end` is offset by
   (obj)->length and then clamped to 0.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement and stays correct under an unbraced if/else.
   Use it as a statement with a trailing semicolon; `obj` is evaluated
   more than once. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5823
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 PyObject *substr,
5826 Py_ssize_t start,
5827 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830 PyUnicodeObject* str_obj;
5831 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005832
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5834 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5837 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 Py_DECREF(str_obj);
5839 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
Tim Petersced69f82003-09-16 20:30:58 +00005841
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005843
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844 result = stringlib_count(
5845 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5846 );
5847
5848 Py_DECREF(sub_obj);
5849 Py_DECREF(str_obj);
5850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 return result;
5852}
5853
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005855 PyObject *sub,
5856 Py_ssize_t start,
5857 Py_ssize_t end,
5858 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005860 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005865 sub = PyUnicode_FromObject(sub);
5866 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 Py_DECREF(str);
5868 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 }
Tim Petersced69f82003-09-16 20:30:58 +00005870
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 if (direction > 0)
5872 result = stringlib_find_slice(
5873 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5874 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5875 start, end
5876 );
5877 else
5878 result = stringlib_rfind_slice(
5879 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5880 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5881 start, end
5882 );
5883
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005885 Py_DECREF(sub);
5886
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 return result;
5888}
5889
Tim Petersced69f82003-09-16 20:30:58 +00005890static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 PyUnicodeObject *substring,
5893 Py_ssize_t start,
5894 Py_ssize_t end,
5895 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 if (substring->length == 0)
5898 return 1;
5899
Thomas Wouters477c8d52006-05-27 19:21:47 +00005900 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901
5902 end -= substring->length;
5903 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
5906 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 if (Py_UNICODE_MATCH(self, end, substring))
5908 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 } else {
5910 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 }
5913
5914 return 0;
5915}
5916
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 PyObject *substr,
5919 Py_ssize_t start,
5920 Py_ssize_t end,
5921 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005924
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 str = PyUnicode_FromObject(str);
5926 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 substr = PyUnicode_FromObject(substr);
5929 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 Py_DECREF(str);
5931 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
Tim Petersced69f82003-09-16 20:30:58 +00005933
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 (PyUnicodeObject *)substr,
5936 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 Py_DECREF(str);
5938 Py_DECREF(substr);
5939 return result;
5940}
5941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942/* Apply fixfct filter to the Unicode object self and return a
5943 reference to the modified object */
5944
Tim Petersced69f82003-09-16 20:30:58 +00005945static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
5949
5950 PyUnicodeObject *u;
5951
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005952 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005955
5956 Py_UNICODE_COPY(u->str, self->str, self->length);
5957
Tim Peters7a29bd52001-09-12 03:03:31 +00005958 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 /* fixfct should return TRUE if it modified the buffer. If
5960 FALSE, return a reference to the original buffer instead
5961 (to save space, not time) */
5962 Py_INCREF(self);
5963 Py_DECREF(u);
5964 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 }
5966 return (PyObject*) u;
5967}
5968
Tim Petersced69f82003-09-16 20:30:58 +00005969static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970int fixupper(PyUnicodeObject *self)
5971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005972 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 Py_UNICODE *s = self->str;
5974 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005978
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 ch = Py_UNICODE_TOUPPER(*s);
5980 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 *s = ch;
5983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 s++;
5985 }
5986
5987 return status;
5988}
5989
Tim Petersced69f82003-09-16 20:30:58 +00005990static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991int fixlower(PyUnicodeObject *self)
5992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005993 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 Py_UNICODE *s = self->str;
5995 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005996
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 ch = Py_UNICODE_TOLOWER(*s);
6001 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 *s = ch;
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 s++;
6006 }
6007
6008 return status;
6009}
6010
Tim Petersced69f82003-09-16 20:30:58 +00006011static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012int fixswapcase(PyUnicodeObject *self)
6013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006014 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 Py_UNICODE *s = self->str;
6016 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 while (len-- > 0) {
6019 if (Py_UNICODE_ISUPPER(*s)) {
6020 *s = Py_UNICODE_TOLOWER(*s);
6021 status = 1;
6022 } else if (Py_UNICODE_ISLOWER(*s)) {
6023 *s = Py_UNICODE_TOUPPER(*s);
6024 status = 1;
6025 }
6026 s++;
6027 }
6028
6029 return status;
6030}
6031
Tim Petersced69f82003-09-16 20:30:58 +00006032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033int fixcapitalize(PyUnicodeObject *self)
6034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006036 Py_UNICODE *s = self->str;
6037 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006038
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006039 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006041 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 *s = Py_UNICODE_TOUPPER(*s);
6043 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006045 s++;
6046 while (--len > 0) {
6047 if (Py_UNICODE_ISUPPER(*s)) {
6048 *s = Py_UNICODE_TOLOWER(*s);
6049 status = 1;
6050 }
6051 s++;
6052 }
6053 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
6056static
6057int fixtitle(PyUnicodeObject *self)
6058{
6059 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6060 register Py_UNICODE *e;
6061 int previous_is_cased;
6062
6063 /* Shortcut for single character strings */
6064 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6066 if (*p != ch) {
6067 *p = ch;
6068 return 1;
6069 }
6070 else
6071 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 }
Tim Petersced69f82003-09-16 20:30:58 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 e = p + PyUnicode_GET_SIZE(self);
6075 previous_is_cased = 0;
6076 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006078
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 if (previous_is_cased)
6080 *p = Py_UNICODE_TOLOWER(ch);
6081 else
6082 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006083
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 if (Py_UNICODE_ISLOWER(ch) ||
6085 Py_UNICODE_ISUPPER(ch) ||
6086 Py_UNICODE_ISTITLE(ch))
6087 previous_is_cased = 1;
6088 else
6089 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
6091 return 1;
6092}
6093
Tim Peters8ce9f162004-08-27 01:49:32 +00006094PyObject *
6095PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096{
Skip Montanaro6543b452004-09-16 03:28:13 +00006097 const Py_UNICODE blank = ' ';
6098 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006099 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006100 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006101 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6102 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006103 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6104 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006105 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006106 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107
Tim Peters05eba1f2004-08-27 21:32:02 +00006108 fseq = PySequence_Fast(seq, "");
6109 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006110 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006111 }
6112
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006113 /* NOTE: the following code can't call back into Python code,
6114 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006115 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006116
Tim Peters05eba1f2004-08-27 21:32:02 +00006117 seqlen = PySequence_Fast_GET_SIZE(fseq);
6118 /* If empty sequence, return u"". */
6119 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006120 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6121 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006122 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006123 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006124 /* If singleton sequence with an exact Unicode, return that. */
6125 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 item = items[0];
6127 if (PyUnicode_CheckExact(item)) {
6128 Py_INCREF(item);
6129 res = (PyUnicodeObject *)item;
6130 goto Done;
6131 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006132 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006133 else {
6134 /* Set up sep and seplen */
6135 if (separator == NULL) {
6136 sep = &blank;
6137 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006138 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006139 else {
6140 if (!PyUnicode_Check(separator)) {
6141 PyErr_Format(PyExc_TypeError,
6142 "separator: expected str instance,"
6143 " %.80s found",
6144 Py_TYPE(separator)->tp_name);
6145 goto onError;
6146 }
6147 sep = PyUnicode_AS_UNICODE(separator);
6148 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006149 }
6150 }
6151
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006152 /* There are at least two things to join, or else we have a subclass
6153 * of str in the sequence.
6154 * Do a pre-pass to figure out the total amount of space we'll
6155 * need (sz), and see whether all argument are strings.
6156 */
6157 sz = 0;
6158 for (i = 0; i < seqlen; i++) {
6159 const Py_ssize_t old_sz = sz;
6160 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 if (!PyUnicode_Check(item)) {
6162 PyErr_Format(PyExc_TypeError,
6163 "sequence item %zd: expected str instance,"
6164 " %.80s found",
6165 i, Py_TYPE(item)->tp_name);
6166 goto onError;
6167 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006168 sz += PyUnicode_GET_SIZE(item);
6169 if (i != 0)
6170 sz += seplen;
6171 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6172 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006174 goto onError;
6175 }
6176 }
Tim Petersced69f82003-09-16 20:30:58 +00006177
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006178 res = _PyUnicode_New(sz);
6179 if (res == NULL)
6180 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006181
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006182 /* Catenate everything. */
6183 res_p = PyUnicode_AS_UNICODE(res);
6184 for (i = 0; i < seqlen; ++i) {
6185 Py_ssize_t itemlen;
6186 item = items[i];
6187 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 /* Copy item, and maybe the separator. */
6189 if (i) {
6190 Py_UNICODE_COPY(res_p, sep, seplen);
6191 res_p += seplen;
6192 }
6193 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6194 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006195 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006196
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006198 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 return (PyObject *)res;
6200
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006202 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006203 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 return NULL;
6205}
6206
Tim Petersced69f82003-09-16 20:30:58 +00006207static
6208PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 Py_ssize_t left,
6210 Py_ssize_t right,
6211 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212{
6213 PyUnicodeObject *u;
6214
6215 if (left < 0)
6216 left = 0;
6217 if (right < 0)
6218 right = 0;
6219
Tim Peters7a29bd52001-09-12 03:03:31 +00006220 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 Py_INCREF(self);
6222 return self;
6223 }
6224
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006225 if (left > PY_SSIZE_T_MAX - self->length ||
6226 right > PY_SSIZE_T_MAX - (left + self->length)) {
6227 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6228 return NULL;
6229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 u = _PyUnicode_New(left + self->length + right);
6231 if (u) {
6232 if (left)
6233 Py_UNICODE_FILL(u->str, fill, left);
6234 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6235 if (right)
6236 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6237 }
6238
6239 return u;
6240}
6241
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * The expanding function must provide a `PyObject *str` scratch
 * variable, a `list` variable and an `onError` label; control jumps to
 * that label when creating the slice or appending it fails.
 *
 * Wrapped in do { } while (0) so that the expansion is exactly one
 * statement and cannot capture a following `else` or escape an
 * unbraced `if`.  Invoke it as a statement with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252
6253static
6254PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 PyObject *list,
6256 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006258 register Py_ssize_t i;
6259 register Py_ssize_t j;
6260 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006262 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263
6264 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006266 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006268 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6270 i++;
6271 if (j < i) {
6272 if (maxcount-- <= 0)
6273 break;
6274 SPLIT_APPEND(buf, j, i);
6275 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6276 i++;
6277 j = i;
6278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 }
6280 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 }
6283 return list;
6284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 Py_DECREF(list);
6287 return NULL;
6288}
6289
6290PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006293 register Py_ssize_t i;
6294 register Py_ssize_t j;
6295 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 PyObject *list;
6297 PyObject *str;
6298 Py_UNICODE *data;
6299
6300 string = PyUnicode_FromObject(string);
6301 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 data = PyUnicode_AS_UNICODE(string);
6304 len = PyUnicode_GET_SIZE(string);
6305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 list = PyList_New(0);
6307 if (!list)
6308 goto onError;
6309
6310 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006312
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 /* Find a line and append it */
6314 while (i < len && !BLOOM_LINEBREAK(data[i]))
6315 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006318 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 if (i < len) {
6320 if (data[i] == '\r' && i + 1 < len &&
6321 data[i+1] == '\n')
6322 i += 2;
6323 else
6324 i++;
6325 if (keepends)
6326 eol = i;
6327 }
6328 SPLIT_APPEND(data, j, eol);
6329 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 }
6331 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 }
6334
6335 Py_DECREF(string);
6336 return list;
6337
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006339 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 Py_DECREF(string);
6341 return NULL;
6342}
6343
Tim Petersced69f82003-09-16 20:30:58 +00006344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 PyObject *list,
6347 Py_UNICODE ch,
6348 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006350 register Py_ssize_t i;
6351 register Py_ssize_t j;
6352 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006354 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
6356 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 if (buf[i] == ch) {
6358 if (maxcount-- <= 0)
6359 break;
6360 SPLIT_APPEND(buf, j, i);
6361 i = j = i + 1;
6362 } else
6363 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 }
6365 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 }
6368 return list;
6369
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 Py_DECREF(list);
6372 return NULL;
6373}
6374
Tim Petersced69f82003-09-16 20:30:58 +00006375static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 PyObject *list,
6378 PyUnicodeObject *substring,
6379 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006381 register Py_ssize_t i;
6382 register Py_ssize_t j;
6383 Py_ssize_t len = self->length;
6384 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 PyObject *str;
6386
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006387 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 if (Py_UNICODE_MATCH(self, i, substring)) {
6389 if (maxcount-- <= 0)
6390 break;
6391 SPLIT_APPEND(self->str, j, i);
6392 i = j = i + sublen;
6393 } else
6394 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
6396 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 }
6399 return list;
6400
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 Py_DECREF(list);
6403 return NULL;
6404}
6405
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006406static
6407PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 PyObject *list,
6409 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006411 register Py_ssize_t i;
6412 register Py_ssize_t j;
6413 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006414 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006415 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006416
6417 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006419 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006421 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6423 i--;
6424 if (j > i) {
6425 if (maxcount-- <= 0)
6426 break;
6427 SPLIT_APPEND(buf, i + 1, j + 1);
6428 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6429 i--;
6430 j = i;
6431 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006432 }
6433 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006435 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 if (PyList_Reverse(list) < 0)
6437 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006438 return list;
6439
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006441 Py_DECREF(list);
6442 return NULL;
6443}
6444
Benjamin Peterson14339b62009-01-31 16:36:08 +00006445static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 PyObject *list,
6448 Py_UNICODE ch,
6449 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006450{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451 register Py_ssize_t i;
6452 register Py_ssize_t j;
6453 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006454 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006455 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006456
6457 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 if (buf[i] == ch) {
6459 if (maxcount-- <= 0)
6460 break;
6461 SPLIT_APPEND(buf, i + 1, j + 1);
6462 j = i = i - 1;
6463 } else
6464 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006465 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006466 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006468 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 if (PyList_Reverse(list) < 0)
6470 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006471 return list;
6472
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006474 Py_DECREF(list);
6475 return NULL;
6476}
6477
Benjamin Peterson14339b62009-01-31 16:36:08 +00006478static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 PyObject *list,
6481 PyUnicodeObject *substring,
6482 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006483{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006484 register Py_ssize_t i;
6485 register Py_ssize_t j;
6486 Py_ssize_t len = self->length;
6487 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006488 PyObject *str;
6489
6490 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 if (Py_UNICODE_MATCH(self, i, substring)) {
6492 if (maxcount-- <= 0)
6493 break;
6494 SPLIT_APPEND(self->str, i + sublen, j);
6495 j = i;
6496 i -= sublen;
6497 } else
6498 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006499 }
6500 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006502 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 if (PyList_Reverse(list) < 0)
6504 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006505 return list;
6506
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006508 Py_DECREF(list);
6509 return NULL;
6510}
6511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512#undef SPLIT_APPEND
6513
6514static
6515PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 PyUnicodeObject *substring,
6517 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
6519 PyObject *list;
6520
6521 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006522 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524 list = PyList_New(0);
6525 if (!list)
6526 return NULL;
6527
6528 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
6531 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 Py_DECREF(list);
6536 PyErr_SetString(PyExc_ValueError, "empty separator");
6537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 }
6539 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541}
6542
Tim Petersced69f82003-09-16 20:30:58 +00006543static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006544PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 PyUnicodeObject *substring,
6546 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006547{
6548 PyObject *list;
6549
6550 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006551 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006552
6553 list = PyList_New(0);
6554 if (!list)
6555 return NULL;
6556
6557 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006559
6560 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006562
6563 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 Py_DECREF(list);
6565 PyErr_SetString(PyExc_ValueError, "empty separator");
6566 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006567 }
6568 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006570}
6571
6572static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 PyUnicodeObject *str1,
6575 PyUnicodeObject *str2,
6576 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577{
6578 PyUnicodeObject *u;
6579
6580 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583 if (str1->length == str2->length) {
6584 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006585 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 if (str1->length == 1) {
6587 /* replace characters */
6588 Py_UNICODE u1, u2;
6589 if (!findchar(self->str, self->length, str1->str[0]))
6590 goto nothing;
6591 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6592 if (!u)
6593 return NULL;
6594 Py_UNICODE_COPY(u->str, self->str, self->length);
6595 u1 = str1->str[0];
6596 u2 = str2->str[0];
6597 for (i = 0; i < u->length; i++)
6598 if (u->str[i] == u1) {
6599 if (--maxcount < 0)
6600 break;
6601 u->str[i] = u2;
6602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006604 i = fastsearch(
6605 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006607 if (i < 0)
6608 goto nothing;
6609 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6610 if (!u)
6611 return NULL;
6612 Py_UNICODE_COPY(u->str, self->str, self->length);
6613 while (i <= self->length - str1->length)
6614 if (Py_UNICODE_MATCH(self, i, str1)) {
6615 if (--maxcount < 0)
6616 break;
6617 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6618 i += str1->length;
6619 } else
6620 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006623
6624 Py_ssize_t n, i, j, e;
6625 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 Py_UNICODE *p;
6627
6628 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006629 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 if (n > maxcount)
6631 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006632 if (n == 0)
6633 goto nothing;
6634 /* new_size = self->length + n * (str2->length - str1->length)); */
6635 delta = (str2->length - str1->length);
6636 if (delta == 0) {
6637 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006639 product = n * (str2->length - str1->length);
6640 if ((product / (str2->length - str1->length)) != n) {
6641 PyErr_SetString(PyExc_OverflowError,
6642 "replace string is too long");
6643 return NULL;
6644 }
6645 new_size = self->length + product;
6646 if (new_size < 0) {
6647 PyErr_SetString(PyExc_OverflowError,
6648 "replace string is too long");
6649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 }
6651 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006652 u = _PyUnicode_New(new_size);
6653 if (!u)
6654 return NULL;
6655 i = 0;
6656 p = u->str;
6657 e = self->length - str1->length;
6658 if (str1->length > 0) {
6659 while (n-- > 0) {
6660 /* look for next match */
6661 j = i;
6662 while (j <= e) {
6663 if (Py_UNICODE_MATCH(self, j, str1))
6664 break;
6665 j++;
6666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668 if (j > e)
6669 break;
6670 /* copy unchanged part [i:j] */
6671 Py_UNICODE_COPY(p, self->str+i, j-i);
6672 p += j - i;
6673 }
6674 /* copy substitution string */
6675 if (str2->length > 0) {
6676 Py_UNICODE_COPY(p, str2->str, str2->length);
6677 p += str2->length;
6678 }
6679 i = j + str1->length;
6680 }
6681 if (i < self->length)
6682 /* copy tail [i:] */
6683 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6684 } else {
6685 /* interleave */
6686 while (n > 0) {
6687 Py_UNICODE_COPY(p, str2->str, str2->length);
6688 p += str2->length;
6689 if (--n <= 0)
6690 break;
6691 *p++ = self->str[i++];
6692 }
6693 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006697
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006699 /* nothing to replace; return original string (when possible) */
6700 if (PyUnicode_CheckExact(self)) {
6701 Py_INCREF(self);
6702 return (PyObject *) self;
6703 }
6704 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705}
6706
6707/* --- Unicode Object Methods --------------------------------------------- */
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
6712Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 return fixup(self, fixtitle);
6719}
6720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723\n\
6724Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006725have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
6727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006728unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 return fixup(self, fixcapitalize);
6731}
6732
6733#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736\n\
6737Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
6740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006741unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742{
6743 PyObject *list;
6744 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 /* Split into words */
6748 list = split(self, NULL, -1);
6749 if (!list)
6750 return NULL;
6751
6752 /* Capitalize each word */
6753 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6754 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (item == NULL)
6757 goto onError;
6758 Py_DECREF(PyList_GET_ITEM(list, i));
6759 PyList_SET_ITEM(list, i, item);
6760 }
6761
6762 /* Join the words to form a new string */
6763 item = PyUnicode_Join(NULL, list);
6764
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 Py_DECREF(list);
6767 return (PyObject *)item;
6768}
6769#endif
6770
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006771/* Argument converter. Coerces to a single unicode character */
6772
6773static int
6774convert_uc(PyObject *obj, void *addr)
6775{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006776 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6777 PyObject *uniobj;
6778 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006779
Benjamin Peterson14339b62009-01-31 16:36:08 +00006780 uniobj = PyUnicode_FromObject(obj);
6781 if (uniobj == NULL) {
6782 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 return 0;
6785 }
6786 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6787 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006789 Py_DECREF(uniobj);
6790 return 0;
6791 }
6792 unistr = PyUnicode_AS_UNICODE(uniobj);
6793 *fillcharloc = unistr[0];
6794 Py_DECREF(uniobj);
6795 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006801Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006802done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject *
6805unicode_center(PyUnicodeObject *self, PyObject *args)
6806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006807 Py_ssize_t marg, left;
6808 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006809 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
Thomas Woutersde017742006-02-16 19:34:37 +00006811 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 return NULL;
6813
Tim Peters7a29bd52001-09-12 03:03:31 +00006814 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 Py_INCREF(self);
6816 return (PyObject*) self;
6817 }
6818
6819 marg = width - self->length;
6820 left = marg / 2 + (marg & width & 1);
6821
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006822 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Marc-André Lemburge5034372000-08-08 08:04:29 +00006825#if 0
6826
6827/* This code should go into some future Unicode collation support
6828 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006829 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006830
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006831/* speedy UTF-16 code point order comparison */
6832/* gleaned from: */
6833/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6834
/* Per-2048-code-unit adjustment, indexed by (code unit >> 11).  Only the
   last five slots are non-zero: slot 27 (0xD800..0xDFFF) is shifted up by
   0x2000 and slots 28..31 (0xE000..0xFFFF) are shifted down by 0x800. */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..26 */ 0, 0, 0,
    /* 27     */ 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843static int
6844unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6845{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006846 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 Py_UNICODE *s1 = str1->str;
6849 Py_UNICODE *s2 = str2->str;
6850
6851 len1 = str1->length;
6852 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006855 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006856
6857 c1 = *s1++;
6858 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006859
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 if (c1 > (1<<11) * 26)
6861 c1 += utf16Fixup[c1>>11];
6862 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006863 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006864 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006865
6866 if (c1 != c2)
6867 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006868
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006869 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
6871
6872 return (len1 < len2) ? -1 : (len1 != len2);
6873}
6874
Marc-André Lemburge5034372000-08-08 08:04:29 +00006875#else
6876
6877static int
6878unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6879{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006880 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006881
6882 Py_UNICODE *s1 = str1->str;
6883 Py_UNICODE *s2 = str2->str;
6884
6885 len1 = str1->length;
6886 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Marc-André Lemburge5034372000-08-08 08:04:29 +00006888 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006889 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006890
Fredrik Lundh45714e92001-06-26 16:39:36 +00006891 c1 = *s1++;
6892 c2 = *s2++;
6893
6894 if (c1 != c2)
6895 return (c1 < c2) ? -1 : 1;
6896
Marc-André Lemburge5034372000-08-08 08:04:29 +00006897 len1--; len2--;
6898 }
6899
6900 return (len1 < len2) ? -1 : (len1 != len2);
6901}
6902
6903#endif
6904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006908 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6909 return unicode_compare((PyUnicodeObject *)left,
6910 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006911 PyErr_Format(PyExc_TypeError,
6912 "Can't compare %.100s and %.100s",
6913 left->ob_type->tp_name,
6914 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 return -1;
6916}
6917
Martin v. Löwis5b222132007-06-10 09:51:05 +00006918int
6919PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6920{
6921 int i;
6922 Py_UNICODE *id;
6923 assert(PyUnicode_Check(uni));
6924 id = PyUnicode_AS_UNICODE(uni);
6925 /* Compare Unicode string and source character set string */
6926 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 if (id[i] != str[i])
6928 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006929 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006931 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006933 return 0;
6934}
6935
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006936
Benjamin Peterson29060642009-01-31 22:14:21 +00006937#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006939
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006940PyObject *PyUnicode_RichCompare(PyObject *left,
6941 PyObject *right,
6942 int op)
6943{
6944 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006945
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006946 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6947 PyObject *v;
6948 if (((PyUnicodeObject *) left)->length !=
6949 ((PyUnicodeObject *) right)->length) {
6950 if (op == Py_EQ) {
6951 Py_INCREF(Py_False);
6952 return Py_False;
6953 }
6954 if (op == Py_NE) {
6955 Py_INCREF(Py_True);
6956 return Py_True;
6957 }
6958 }
6959 if (left == right)
6960 result = 0;
6961 else
6962 result = unicode_compare((PyUnicodeObject *)left,
6963 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006965 /* Convert the return value to a Boolean */
6966 switch (op) {
6967 case Py_EQ:
6968 v = TEST_COND(result == 0);
6969 break;
6970 case Py_NE:
6971 v = TEST_COND(result != 0);
6972 break;
6973 case Py_LE:
6974 v = TEST_COND(result <= 0);
6975 break;
6976 case Py_GE:
6977 v = TEST_COND(result >= 0);
6978 break;
6979 case Py_LT:
6980 v = TEST_COND(result == -1);
6981 break;
6982 case Py_GT:
6983 v = TEST_COND(result == 1);
6984 break;
6985 default:
6986 PyErr_BadArgument();
6987 return NULL;
6988 }
6989 Py_INCREF(v);
6990 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006991 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006993 Py_INCREF(Py_NotImplemented);
6994 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006995}
6996
Guido van Rossum403d68b2000-03-13 15:55:09 +00006997int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006999{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007000 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007001 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007002
7003 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007004 sub = PyUnicode_FromObject(element);
7005 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 PyErr_Format(PyExc_TypeError,
7007 "'in <string>' requires string as left operand, not %s",
7008 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007010 }
7011
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 str = PyUnicode_FromObject(container);
7013 if (!str) {
7014 Py_DECREF(sub);
7015 return -1;
7016 }
7017
7018 result = stringlib_contains_obj(str, sub);
7019
7020 Py_DECREF(str);
7021 Py_DECREF(sub);
7022
Guido van Rossum403d68b2000-03-13 15:55:09 +00007023 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007024}
7025
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026/* Concat to string or Unicode object giving a new Unicode object. */
7027
7028PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
7031 PyUnicodeObject *u = NULL, *v = NULL, *w;
7032
7033 /* Coerce the two arguments */
7034 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7035 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7038 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041 /* Shortcuts */
7042 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 Py_DECREF(v);
7044 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 }
7046 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 Py_DECREF(u);
7048 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
7050
7051 /* Concat the two Unicode strings */
7052 w = _PyUnicode_New(u->length + v->length);
7053 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 Py_UNICODE_COPY(w->str, u->str, u->length);
7056 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7057
7058 Py_DECREF(u);
7059 Py_DECREF(v);
7060 return (PyObject *)w;
7061
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 Py_XDECREF(u);
7064 Py_XDECREF(v);
7065 return NULL;
7066}
7067
Walter Dörwald1ab83302007-05-18 17:15:44 +00007068void
7069PyUnicode_Append(PyObject **pleft, PyObject *right)
7070{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007071 PyObject *new;
7072 if (*pleft == NULL)
7073 return;
7074 if (right == NULL || !PyUnicode_Check(*pleft)) {
7075 Py_DECREF(*pleft);
7076 *pleft = NULL;
7077 return;
7078 }
7079 new = PyUnicode_Concat(*pleft, right);
7080 Py_DECREF(*pleft);
7081 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007082}
7083
7084void
7085PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7086{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007087 PyUnicode_Append(pleft, right);
7088 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007089}
7090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007094Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007095string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097
7098static PyObject *
7099unicode_count(PyUnicodeObject *self, PyObject *args)
7100{
7101 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007102 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007103 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 PyObject *result;
7105
Guido van Rossumb8872e62000-05-09 14:14:27 +00007106 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 return NULL;
7109
7110 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007114
Thomas Wouters477c8d52006-05-27 19:21:47 +00007115 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116
Christian Heimes217cfd12007-12-02 14:31:20 +00007117 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007118 stringlib_count(self->str + start, end - start,
7119 substring->str, substring->length)
7120 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007123
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 return result;
7125}
7126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007130Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007131to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007132handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7134'xmlcharrefreplace' as well as any other name registered with\n\
7135codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
7137static PyObject *
7138unicode_encode(PyUnicodeObject *self, PyObject *args)
7139{
7140 char *encoding = NULL;
7141 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007142 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007143
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7145 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007146 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007147 if (v == NULL)
7148 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007149 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007150 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007151 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007152 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007153 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007154 Py_DECREF(v);
7155 return NULL;
7156 }
7157 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007158
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007160 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007161}
7162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007163PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165\n\
7166Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
7169static PyObject*
7170unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7171{
7172 Py_UNICODE *e;
7173 Py_UNICODE *p;
7174 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007175 Py_UNICODE *qe;
7176 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 PyUnicodeObject *u;
7178 int tabsize = 8;
7179
7180 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
Thomas Wouters7e474022000-07-16 12:04:32 +00007183 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007184 i = 0; /* chars up to and including most recent \n or \r */
7185 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7186 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 for (p = self->str; p < e; p++)
7188 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 if (tabsize > 0) {
7190 incr = tabsize - (j % tabsize); /* cannot overflow */
7191 if (j > PY_SSIZE_T_MAX - incr)
7192 goto overflow1;
7193 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 if (j > PY_SSIZE_T_MAX - 1)
7198 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 j++;
7200 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 if (i > PY_SSIZE_T_MAX - j)
7202 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007204 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 }
7206 }
7207
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007208 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 /* Second pass: create output string and fill it */
7212 u = _PyUnicode_New(i + j);
7213 if (!u)
7214 return NULL;
7215
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007216 j = 0; /* same as in first pass */
7217 q = u->str; /* next output char */
7218 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
7220 for (p = self->str; p < e; p++)
7221 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 if (tabsize > 0) {
7223 i = tabsize - (j % tabsize);
7224 j += i;
7225 while (i--) {
7226 if (q >= qe)
7227 goto overflow2;
7228 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007229 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007231 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 else {
7233 if (q >= qe)
7234 goto overflow2;
7235 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007236 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 if (*p == '\n' || *p == '\r')
7238 j = 0;
7239 }
7240
7241 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007242
7243 overflow2:
7244 Py_DECREF(u);
7245 overflow1:
7246 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248}
7249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007250PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252\n\
7253Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007254such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255arguments start and end are interpreted as in slice notation.\n\
7256\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007257Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258
7259static PyObject *
7260unicode_find(PyUnicodeObject *self, PyObject *args)
7261{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007262 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007263 Py_ssize_t start;
7264 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007265 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266
Christian Heimes9cd17752007-11-18 19:35:23 +00007267 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
Thomas Wouters477c8d52006-05-27 19:21:47 +00007270 result = stringlib_find_slice(
7271 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7272 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7273 start, end
7274 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
7276 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007277
Christian Heimes217cfd12007-12-02 14:31:20 +00007278 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279}
7280
7281static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007282unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283{
7284 if (index < 0 || index >= self->length) {
7285 PyErr_SetString(PyExc_IndexError, "string index out of range");
7286 return NULL;
7287 }
7288
7289 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7290}
7291
Guido van Rossumc2504932007-09-18 19:42:40 +00007292/* Believe it or not, this produces the same value for ASCII strings
7293 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007295unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296{
Guido van Rossumc2504932007-09-18 19:42:40 +00007297 Py_ssize_t len;
7298 Py_UNICODE *p;
7299 long x;
7300
7301 if (self->hash != -1)
7302 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007303 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007304 p = self->str;
7305 x = *p << 7;
7306 while (--len >= 0)
7307 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007308 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007309 if (x == -1)
7310 x = -2;
7311 self->hash = x;
7312 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313}
7314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007315PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
7320static PyObject *
7321unicode_index(PyUnicodeObject *self, PyObject *args)
7322{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007323 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007324 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007325 Py_ssize_t start;
7326 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
Christian Heimes9cd17752007-11-18 19:35:23 +00007328 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
Thomas Wouters477c8d52006-05-27 19:21:47 +00007331 result = stringlib_find_slice(
7332 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7333 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7334 start, end
7335 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336
7337 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007338
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 if (result < 0) {
7340 PyErr_SetString(PyExc_ValueError, "substring not found");
7341 return NULL;
7342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007343
Christian Heimes217cfd12007-12-02 14:31:20 +00007344 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345}
7346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007347PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007350Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007351at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
7353static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007354unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355{
7356 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7357 register const Py_UNICODE *e;
7358 int cased;
7359
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 /* Shortcut for single character strings */
7361 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007364 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007365 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007367
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 e = p + PyUnicode_GET_SIZE(self);
7369 cased = 0;
7370 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007372
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7374 return PyBool_FromLong(0);
7375 else if (!cased && Py_UNICODE_ISLOWER(ch))
7376 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007378 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379}
7380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007381PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007384Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
7387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007388unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389{
7390 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7391 register const Py_UNICODE *e;
7392 int cased;
7393
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 /* Shortcut for single character strings */
7395 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007398 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007399 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007401
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 e = p + PyUnicode_GET_SIZE(self);
7403 cased = 0;
7404 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007406
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7408 return PyBool_FromLong(0);
7409 else if (!cased && Py_UNICODE_ISUPPER(ch))
7410 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007412 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413}
7414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007415PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007418Return True if S is a titlecased string and there is at least one\n\
7419character in S, i.e. upper- and titlecase characters may only\n\
7420follow uncased characters and lowercase characters only cased ones.\n\
7421Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
7423static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007424unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425{
7426 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7427 register const Py_UNICODE *e;
7428 int cased, previous_is_cased;
7429
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 /* Shortcut for single character strings */
7431 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7433 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007435 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007436 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 e = p + PyUnicode_GET_SIZE(self);
7440 cased = 0;
7441 previous_is_cased = 0;
7442 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007444
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7446 if (previous_is_cased)
7447 return PyBool_FromLong(0);
7448 previous_is_cased = 1;
7449 cased = 1;
7450 }
7451 else if (Py_UNICODE_ISLOWER(ch)) {
7452 if (!previous_is_cased)
7453 return PyBool_FromLong(0);
7454 previous_is_cased = 1;
7455 cased = 1;
7456 }
7457 else
7458 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007460 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007463PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007466Return True if all characters in S are whitespace\n\
7467and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468
7469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007470unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
7472 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7473 register const Py_UNICODE *e;
7474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 /* Shortcut for single character strings */
7476 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 Py_UNICODE_ISSPACE(*p))
7478 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007480 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007481 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007483
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 e = p + PyUnicode_GET_SIZE(self);
7485 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 if (!Py_UNICODE_ISSPACE(*p))
7487 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007489 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007494\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007495Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007497
7498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007499unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007500{
7501 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7502 register const Py_UNICODE *e;
7503
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007504 /* Shortcut for single character strings */
7505 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 Py_UNICODE_ISALPHA(*p))
7507 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007508
7509 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007510 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007512
7513 e = p + PyUnicode_GET_SIZE(self);
7514 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 if (!Py_UNICODE_ISALPHA(*p))
7516 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007517 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007518 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007519}
7520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007523\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007524Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007526
7527static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007528unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007529{
7530 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7531 register const Py_UNICODE *e;
7532
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007533 /* Shortcut for single character strings */
7534 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 Py_UNICODE_ISALNUM(*p))
7536 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007537
7538 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007539 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541
7542 e = p + PyUnicode_GET_SIZE(self);
7543 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 if (!Py_UNICODE_ISALNUM(*p))
7545 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007546 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007547 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007548}
7549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007550PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007553Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555
7556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007557unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558{
7559 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7560 register const Py_UNICODE *e;
7561
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 /* Shortcut for single character strings */
7563 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 Py_UNICODE_ISDECIMAL(*p))
7565 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007567 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007568 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007570
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 e = p + PyUnicode_GET_SIZE(self);
7572 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 if (!Py_UNICODE_ISDECIMAL(*p))
7574 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007576 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007579PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007582Return True if all characters in S are digits\n\
7583and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007586unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
7588 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7589 register const Py_UNICODE *e;
7590
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 /* Shortcut for single character strings */
7592 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 Py_UNICODE_ISDIGIT(*p))
7594 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007596 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007597 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007599
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 e = p + PyUnicode_GET_SIZE(self);
7601 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 if (!Py_UNICODE_ISDIGIT(*p))
7603 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007605 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606}
7607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007608PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007611Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613
7614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007615unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616{
7617 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7618 register const Py_UNICODE *e;
7619
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 /* Shortcut for single character strings */
7621 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 Py_UNICODE_ISNUMERIC(*p))
7623 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007625 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007626 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007628
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 e = p + PyUnicode_GET_SIZE(self);
7630 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 if (!Py_UNICODE_ISNUMERIC(*p))
7632 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007634 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635}
7636
Martin v. Löwis47383402007-08-15 07:32:56 +00007637int
7638PyUnicode_IsIdentifier(PyObject *self)
7639{
7640 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7641 register const Py_UNICODE *e;
7642
7643 /* Special case for empty strings */
7644 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007646
7647 /* PEP 3131 says that the first character must be in
7648 XID_Start and subsequent characters in XID_Continue,
7649 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007651 letters, digits, underscore). However, given the current
7652 definition of XID_Start and XID_Continue, it is sufficient
7653 to check just for these, except that _ must be allowed
7654 as starting an identifier. */
7655 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7656 return 0;
7657
7658 e = p + PyUnicode_GET_SIZE(self);
7659 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 if (!_PyUnicode_IsXidContinue(*p))
7661 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007662 }
7663 return 1;
7664}
7665
7666PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007668\n\
7669Return True if S is a valid identifier according\n\
7670to the language definition.");
7671
7672static PyObject*
7673unicode_isidentifier(PyObject *self)
7674{
7675 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7676}
7677
Georg Brandl559e5d72008-06-11 18:37:52 +00007678PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007680\n\
7681Return True if all characters in S are considered\n\
7682printable in repr() or S is empty, False otherwise.");
7683
7684static PyObject*
7685unicode_isprintable(PyObject *self)
7686{
7687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7688 register const Py_UNICODE *e;
7689
7690 /* Shortcut for single character strings */
7691 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7692 Py_RETURN_TRUE;
7693 }
7694
7695 e = p + PyUnicode_GET_SIZE(self);
7696 for (; p < e; p++) {
7697 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7698 Py_RETURN_FALSE;
7699 }
7700 }
7701 Py_RETURN_TRUE;
7702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
7707Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709
7710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007711unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007713 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714}
7715
Martin v. Löwis18e16552006-02-15 17:27:45 +00007716static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717unicode_length(PyUnicodeObject *self)
7718{
7719 return self->length;
7720}
7721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007722PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007725Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007726done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
7728static PyObject *
7729unicode_ljust(PyUnicodeObject *self, PyObject *args)
7730{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007731 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007732 Py_UNICODE fillchar = ' ';
7733
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007734 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 return NULL;
7736
Tim Peters7a29bd52001-09-12 03:03:31 +00007737 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 Py_INCREF(self);
7739 return (PyObject*) self;
7740 }
7741
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007742 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743}
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
7750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007751unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 return fixup(self, fixlower);
7754}
7755
/* Selectors for the strip family; each value is also the index of the
   matching entry in stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7764
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007765/* externally visible for str.strip(unicode) */
7766PyObject *
7767_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7770 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7771 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7772 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7773 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007774
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007776
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 i = 0;
7778 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7780 i++;
7781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 j = len;
7785 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 do {
7787 j--;
7788 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7789 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007791
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 Py_INCREF(self);
7794 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007795 }
7796 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007798}
7799
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
7801static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007802do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7805 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806
Benjamin Peterson14339b62009-01-31 16:36:08 +00007807 i = 0;
7808 if (striptype != RIGHTSTRIP) {
7809 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7810 i++;
7811 }
7812 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 j = len;
7815 if (striptype != LEFTSTRIP) {
7816 do {
7817 j--;
7818 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7819 j++;
7820 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007821
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7823 Py_INCREF(self);
7824 return (PyObject*)self;
7825 }
7826 else
7827 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828}
7829
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007830
7831static PyObject *
7832do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007835
Benjamin Peterson14339b62009-01-31 16:36:08 +00007836 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7837 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 if (sep != NULL && sep != Py_None) {
7840 if (PyUnicode_Check(sep))
7841 return _PyUnicode_XStrip(self, striptype, sep);
7842 else {
7843 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 "%s arg must be None or str",
7845 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007846 return NULL;
7847 }
7848 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007849
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851}
7852
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007856\n\
7857Return a copy of the string S with leading and trailing\n\
7858whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007859If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007860
7861static PyObject *
7862unicode_strip(PyUnicodeObject *self, PyObject *args)
7863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 if (PyTuple_GET_SIZE(args) == 0)
7865 return do_strip(self, BOTHSTRIP); /* Common case */
7866 else
7867 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007868}
7869
7870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007871PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007873\n\
7874Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007875If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876
7877static PyObject *
7878unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7879{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 if (PyTuple_GET_SIZE(args) == 0)
7881 return do_strip(self, LEFTSTRIP); /* Common case */
7882 else
7883 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884}
7885
7886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007889\n\
7890Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007891If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892
7893static PyObject *
7894unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 if (PyTuple_GET_SIZE(args) == 0)
7897 return do_strip(self, RIGHTSTRIP); /* Common case */
7898 else
7899 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007900}
7901
7902
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007904unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905{
7906 PyUnicodeObject *u;
7907 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007908 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007909 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
Georg Brandl222de0f2009-04-12 12:01:50 +00007911 if (len < 1) {
7912 Py_INCREF(unicode_empty);
7913 return (PyObject *)unicode_empty;
7914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915
Tim Peters7a29bd52001-09-12 03:03:31 +00007916 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 /* no repeat, return original string */
7918 Py_INCREF(str);
7919 return (PyObject*) str;
7920 }
Tim Peters8f422462000-09-09 06:13:41 +00007921
7922 /* ensure # of chars needed doesn't overflow int and # of bytes
7923 * needed doesn't overflow size_t
7924 */
7925 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007926 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007927 PyErr_SetString(PyExc_OverflowError,
7928 "repeated string is too long");
7929 return NULL;
7930 }
7931 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7932 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7933 PyErr_SetString(PyExc_OverflowError,
7934 "repeated string is too long");
7935 return NULL;
7936 }
7937 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 if (!u)
7939 return NULL;
7940
7941 p = u->str;
7942
Georg Brandl222de0f2009-04-12 12:01:50 +00007943 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007944 Py_UNICODE_FILL(p, str->str[0], len);
7945 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007946 Py_ssize_t done = str->length; /* number of characters copied this far */
7947 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007949 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007950 Py_UNICODE_COPY(p+done, p, n);
7951 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 }
7954
7955 return (PyObject*) u;
7956}
7957
7958PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 PyObject *subobj,
7960 PyObject *replobj,
7961 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
7963 PyObject *self;
7964 PyObject *str1;
7965 PyObject *str2;
7966 PyObject *result;
7967
7968 self = PyUnicode_FromObject(obj);
7969 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 str1 = PyUnicode_FromObject(subobj);
7972 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 Py_DECREF(self);
7974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
7976 str2 = PyUnicode_FromObject(replobj);
7977 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 Py_DECREF(self);
7979 Py_DECREF(str1);
7980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 }
Tim Petersced69f82003-09-16 20:30:58 +00007982 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 (PyUnicodeObject *)str1,
7984 (PyUnicodeObject *)str2,
7985 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 Py_DECREF(self);
7987 Py_DECREF(str1);
7988 Py_DECREF(str2);
7989 return result;
7990}
7991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007992PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994\n\
7995Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007996old replaced by new. If the optional argument count is\n\
7997given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
7999static PyObject*
8000unicode_replace(PyUnicodeObject *self, PyObject *args)
8001{
8002 PyUnicodeObject *str1;
8003 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008004 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 PyObject *result;
8006
Martin v. Löwis18e16552006-02-15 17:27:45 +00008007 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 return NULL;
8009 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8010 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008013 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_DECREF(str1);
8015 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017
8018 result = replace(self, str1, str2, maxcount);
8019
8020 Py_DECREF(str1);
8021 Py_DECREF(str2);
8022 return result;
8023}
8024
8025static
8026PyObject *unicode_repr(PyObject *unicode)
8027{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008028 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008029 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008030 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8031 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8032
8033 /* XXX(nnorwitz): rather than over-allocating, it would be
8034 better to choose a different scheme. Perhaps scan the
8035 first N-chars of the string and allocate based on that size.
8036 */
8037 /* Initial allocation is based on the longest-possible unichr
8038 escape.
8039
8040 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8041 unichr, so in this case it's the longest unichr escape. In
8042 narrow (UTF-16) builds this is five chars per source unichr
8043 since there are two unichrs in the surrogate pair, so in narrow
8044 (UTF-16) builds it's not the longest unichr escape.
8045
8046 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8047 so in the narrow (UTF-16) build case it's the longest unichr
8048 escape.
8049 */
8050
Walter Dörwald1ab83302007-05-18 17:15:44 +00008051 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008053#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008055#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008057#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008059 if (repr == NULL)
8060 return NULL;
8061
Walter Dörwald1ab83302007-05-18 17:15:44 +00008062 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008063
8064 /* Add quote */
8065 *p++ = (findchar(s, size, '\'') &&
8066 !findchar(s, size, '"')) ? '"' : '\'';
8067 while (size-- > 0) {
8068 Py_UNICODE ch = *s++;
8069
8070 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008071 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008072 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008073 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008074 continue;
8075 }
8076
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008078 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008079 *p++ = '\\';
8080 *p++ = 't';
8081 }
8082 else if (ch == '\n') {
8083 *p++ = '\\';
8084 *p++ = 'n';
8085 }
8086 else if (ch == '\r') {
8087 *p++ = '\\';
8088 *p++ = 'r';
8089 }
8090
8091 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008092 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008093 *p++ = '\\';
8094 *p++ = 'x';
8095 *p++ = hexdigits[(ch >> 4) & 0x000F];
8096 *p++ = hexdigits[ch & 0x000F];
8097 }
8098
Georg Brandl559e5d72008-06-11 18:37:52 +00008099 /* Copy ASCII characters as-is */
8100 else if (ch < 0x7F) {
8101 *p++ = ch;
8102 }
8103
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008105 else {
8106 Py_UCS4 ucs = ch;
8107
8108#ifndef Py_UNICODE_WIDE
8109 Py_UNICODE ch2 = 0;
8110 /* Get code point from surrogate pair */
8111 if (size > 0) {
8112 ch2 = *s;
8113 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008115 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008118 size--;
8119 }
8120 }
8121#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008122 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008123 (categories Z* and C* except ASCII space)
8124 */
8125 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8126 /* Map 8-bit characters to '\xhh' */
8127 if (ucs <= 0xff) {
8128 *p++ = '\\';
8129 *p++ = 'x';
8130 *p++ = hexdigits[(ch >> 4) & 0x000F];
8131 *p++ = hexdigits[ch & 0x000F];
8132 }
8133 /* Map 21-bit characters to '\U00xxxxxx' */
8134 else if (ucs >= 0x10000) {
8135 *p++ = '\\';
8136 *p++ = 'U';
8137 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8138 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8139 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8140 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8141 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8142 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8143 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8144 *p++ = hexdigits[ucs & 0x0000000F];
8145 }
8146 /* Map 16-bit characters to '\uxxxx' */
8147 else {
8148 *p++ = '\\';
8149 *p++ = 'u';
8150 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8151 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8152 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8153 *p++ = hexdigits[ucs & 0x000F];
8154 }
8155 }
8156 /* Copy characters as-is */
8157 else {
8158 *p++ = ch;
8159#ifndef Py_UNICODE_WIDE
8160 if (ucs >= 0x10000)
8161 *p++ = ch2;
8162#endif
8163 }
8164 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008165 }
8166 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008167 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008168
8169 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008170 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008171 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172}
8173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176\n\
8177Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008178such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179arguments start and end are interpreted as in slice notation.\n\
8180\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008181Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
8183static PyObject *
8184unicode_rfind(PyUnicodeObject *self, PyObject *args)
8185{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008186 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008187 Py_ssize_t start;
8188 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008189 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190
Christian Heimes9cd17752007-11-18 19:35:23 +00008191 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194 result = stringlib_rfind_slice(
8195 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8196 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8197 start, end
8198 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199
8200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008201
Christian Heimes217cfd12007-12-02 14:31:20 +00008202 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203}
8204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008205PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008208Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
8210static PyObject *
8211unicode_rindex(PyUnicodeObject *self, PyObject *args)
8212{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008213 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008214 Py_ssize_t start;
8215 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008216 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217
Christian Heimes9cd17752007-11-18 19:35:23 +00008218 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220
Thomas Wouters477c8d52006-05-27 19:21:47 +00008221 result = stringlib_rfind_slice(
8222 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8223 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8224 start, end
8225 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226
8227 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 if (result < 0) {
8230 PyErr_SetString(PyExc_ValueError, "substring not found");
8231 return NULL;
8232 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008233 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
8235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008236PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008239Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008240done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241
8242static PyObject *
8243unicode_rjust(PyUnicodeObject *self, PyObject *args)
8244{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008245 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008246 Py_UNICODE fillchar = ' ';
8247
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008248 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 return NULL;
8250
Tim Peters7a29bd52001-09-12 03:03:31 +00008251 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 Py_INCREF(self);
8253 return (PyObject*) self;
8254 }
8255
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008256 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257}
8258
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 PyObject *sep,
8261 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262{
8263 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008264
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 s = PyUnicode_FromObject(s);
8266 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 if (sep != NULL) {
8269 sep = PyUnicode_FromObject(sep);
8270 if (sep == NULL) {
8271 Py_DECREF(s);
8272 return NULL;
8273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
8275
8276 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8277
8278 Py_DECREF(s);
8279 Py_XDECREF(sep);
8280 return result;
8281}
8282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008283PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285\n\
8286Return a list of the words in S, using sep as the\n\
8287delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008288splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008289whitespace string is a separator and empty strings are\n\
8290removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291
8292static PyObject*
8293unicode_split(PyUnicodeObject *self, PyObject *args)
8294{
8295 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Martin v. Löwis18e16552006-02-15 17:27:45 +00008298 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 return NULL;
8300
8301 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307}
8308
Thomas Wouters477c8d52006-05-27 19:21:47 +00008309PyObject *
8310PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8311{
8312 PyObject* str_obj;
8313 PyObject* sep_obj;
8314 PyObject* out;
8315
8316 str_obj = PyUnicode_FromObject(str_in);
8317 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319 sep_obj = PyUnicode_FromObject(sep_in);
8320 if (!sep_obj) {
8321 Py_DECREF(str_obj);
8322 return NULL;
8323 }
8324
8325 out = stringlib_partition(
8326 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8327 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8328 );
8329
8330 Py_DECREF(sep_obj);
8331 Py_DECREF(str_obj);
8332
8333 return out;
8334}
8335
8336
8337PyObject *
8338PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8339{
8340 PyObject* str_obj;
8341 PyObject* sep_obj;
8342 PyObject* out;
8343
8344 str_obj = PyUnicode_FromObject(str_in);
8345 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347 sep_obj = PyUnicode_FromObject(sep_in);
8348 if (!sep_obj) {
8349 Py_DECREF(str_obj);
8350 return NULL;
8351 }
8352
8353 out = stringlib_rpartition(
8354 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8355 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8356 );
8357
8358 Py_DECREF(sep_obj);
8359 Py_DECREF(str_obj);
8360
8361 return out;
8362}
8363
8364PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008366\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008367Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008368the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008369found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008370
8371static PyObject*
8372unicode_partition(PyUnicodeObject *self, PyObject *separator)
8373{
8374 return PyUnicode_Partition((PyObject *)self, separator);
8375}
8376
8377PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008380Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008381the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008382separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383
8384static PyObject*
8385unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8386{
8387 return PyUnicode_RPartition((PyObject *)self, separator);
8388}
8389
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008390PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 PyObject *sep,
8392 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008393{
8394 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008396 s = PyUnicode_FromObject(s);
8397 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 if (sep != NULL) {
8400 sep = PyUnicode_FromObject(sep);
8401 if (sep == NULL) {
8402 Py_DECREF(s);
8403 return NULL;
8404 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008405 }
8406
8407 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8408
8409 Py_DECREF(s);
8410 Py_XDECREF(sep);
8411 return result;
8412}
8413
8414PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008416\n\
8417Return a list of the words in S, using sep as the\n\
8418delimiter string, starting at the end of the string and\n\
8419working to the front. If maxsplit is given, at most maxsplit\n\
8420splits are done. If sep is not specified, any whitespace string\n\
8421is a separator.");
8422
8423static PyObject*
8424unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8425{
8426 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008428
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008430 return NULL;
8431
8432 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008434 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008436 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008438}
8439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008440PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442\n\
8443Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008444Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008445is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
8447static PyObject*
8448unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8449{
Guido van Rossum86662912000-04-11 15:38:46 +00008450 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451
Guido van Rossum86662912000-04-11 15:38:46 +00008452 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 return NULL;
8454
Guido van Rossum86662912000-04-11 15:38:46 +00008455 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456}
8457
8458static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008459PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460{
Walter Dörwald346737f2007-05-31 10:44:43 +00008461 if (PyUnicode_CheckExact(self)) {
8462 Py_INCREF(self);
8463 return self;
8464 } else
8465 /* Subtype -- return genuine unicode string with the same value. */
8466 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8467 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468}
8469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008470PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472\n\
8473Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008474and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475
8476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008477unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 return fixup(self, fixswapcase);
8480}
8481
Georg Brandlceee0772007-11-27 23:48:05 +00008482PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008484\n\
8485Return a translation table usable for str.translate().\n\
8486If there is only one argument, it must be a dictionary mapping Unicode\n\
8487ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008488Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008489If there are two arguments, they must be strings of equal length, and\n\
8490in the resulting dictionary, each character in x will be mapped to the\n\
8491character at the same position in y. If there is a third argument, it\n\
8492must be a string, whose characters will be mapped to None in the result.");
8493
8494static PyObject*
8495unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8496{
8497 PyObject *x, *y = NULL, *z = NULL;
8498 PyObject *new = NULL, *key, *value;
8499 Py_ssize_t i = 0;
8500 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501
Georg Brandlceee0772007-11-27 23:48:05 +00008502 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8503 return NULL;
8504 new = PyDict_New();
8505 if (!new)
8506 return NULL;
8507 if (y != NULL) {
8508 /* x must be a string too, of equal length */
8509 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8510 if (!PyUnicode_Check(x)) {
8511 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8512 "be a string if there is a second argument");
8513 goto err;
8514 }
8515 if (PyUnicode_GET_SIZE(x) != ylen) {
8516 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8517 "arguments must have equal length");
8518 goto err;
8519 }
8520 /* create entries for translating chars in x to those in y */
8521 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008522 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8523 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008524 if (!key || !value)
8525 goto err;
8526 res = PyDict_SetItem(new, key, value);
8527 Py_DECREF(key);
8528 Py_DECREF(value);
8529 if (res < 0)
8530 goto err;
8531 }
8532 /* create entries for deleting chars in z */
8533 if (z != NULL) {
8534 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008535 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008536 if (!key)
8537 goto err;
8538 res = PyDict_SetItem(new, key, Py_None);
8539 Py_DECREF(key);
8540 if (res < 0)
8541 goto err;
8542 }
8543 }
8544 } else {
8545 /* x must be a dict */
8546 if (!PyDict_Check(x)) {
8547 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8548 "to maketrans it must be a dict");
8549 goto err;
8550 }
8551 /* copy entries into the new dict, converting string keys to int keys */
8552 while (PyDict_Next(x, &i, &key, &value)) {
8553 if (PyUnicode_Check(key)) {
8554 /* convert string keys to integer keys */
8555 PyObject *newkey;
8556 if (PyUnicode_GET_SIZE(key) != 1) {
8557 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8558 "table must be of length 1");
8559 goto err;
8560 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008561 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008562 if (!newkey)
8563 goto err;
8564 res = PyDict_SetItem(new, newkey, value);
8565 Py_DECREF(newkey);
8566 if (res < 0)
8567 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008568 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008569 /* just keep integer keys */
8570 if (PyDict_SetItem(new, key, value) < 0)
8571 goto err;
8572 } else {
8573 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8574 "be strings or integers");
8575 goto err;
8576 }
8577 }
8578 }
8579 return new;
8580 err:
8581 Py_DECREF(new);
8582 return NULL;
8583}
8584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008585PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587\n\
8588Return a copy of the string S, where all characters have been mapped\n\
8589through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008590Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008591Unmapped characters are left untouched. Characters mapped to None\n\
8592are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
8594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008595unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596{
Georg Brandlceee0772007-11-27 23:48:05 +00008597 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008603Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604
8605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008606unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 return fixup(self, fixupper);
8609}
8610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008611PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008614Pad a numeric string S with zeros on the left, to fill a field\n\
8615of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
8617static PyObject *
8618unicode_zfill(PyUnicodeObject *self, PyObject *args)
8619{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008620 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 PyUnicodeObject *u;
8622
Martin v. Löwis18e16552006-02-15 17:27:45 +00008623 Py_ssize_t width;
8624 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 return NULL;
8626
8627 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008628 if (PyUnicode_CheckExact(self)) {
8629 Py_INCREF(self);
8630 return (PyObject*) self;
8631 }
8632 else
8633 return PyUnicode_FromUnicode(
8634 PyUnicode_AS_UNICODE(self),
8635 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
8638
8639 fill = width - self->length;
8640
8641 u = pad(self, fill, 0, '0');
8642
Walter Dörwald068325e2002-04-15 13:36:47 +00008643 if (u == NULL)
8644 return NULL;
8645
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 if (u->str[fill] == '+' || u->str[fill] == '-') {
8647 /* move sign to beginning of string */
8648 u->str[0] = u->str[fill];
8649 u->str[fill] = '0';
8650 }
8651
8652 return (PyObject*) u;
8653}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
#if 0
/* Compiled out: reports the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008663PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008666Return True if S starts with the specified prefix, False otherwise.\n\
8667With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008668With optional end, stop comparing S at that position.\n\
8669prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
8671static PyObject *
8672unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008675 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008677 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008678 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008679 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008681 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8683 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008684 if (PyTuple_Check(subobj)) {
8685 Py_ssize_t i;
8686 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8687 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008689 if (substring == NULL)
8690 return NULL;
8691 result = tailmatch(self, substring, start, end, -1);
8692 Py_DECREF(substring);
8693 if (result) {
8694 Py_RETURN_TRUE;
8695 }
8696 }
8697 /* nothing matched */
8698 Py_RETURN_FALSE;
8699 }
8700 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008703 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008705 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706}
8707
8708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008709PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008712Return True if S ends with the specified suffix, False otherwise.\n\
8713With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008714With optional end, stop comparing S at that position.\n\
8715suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716
8717static PyObject *
8718unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008721 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008723 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008724 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008725 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8729 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 if (PyTuple_Check(subobj)) {
8731 Py_ssize_t i;
8732 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8733 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008735 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008737 result = tailmatch(self, substring, start, end, +1);
8738 Py_DECREF(substring);
8739 if (result) {
8740 Py_RETURN_TRUE;
8741 }
8742 }
8743 Py_RETURN_FALSE;
8744 }
8745 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008749 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008751 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752}
8753
Eric Smith8c663262007-08-25 02:26:07 +00008754#include "stringlib/string_format.h"
8755
8756PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008758\n\
8759");
8760
Eric Smith4a7d76d2008-05-30 18:10:19 +00008761static PyObject *
8762unicode__format__(PyObject* self, PyObject* args)
8763{
8764 PyObject *format_spec;
8765
8766 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8767 return NULL;
8768
8769 return _PyUnicode_FormatAdvanced(self,
8770 PyUnicode_AS_UNICODE(format_spec),
8771 PyUnicode_GET_SIZE(format_spec));
8772}
8773
Eric Smith8c663262007-08-25 02:26:07 +00008774PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008776\n\
8777");
8778
8779static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008780unicode__sizeof__(PyUnicodeObject *v)
8781{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008782 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8783 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008784}
8785
/* Docstring used by the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008788
8789static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008790unicode_getnewargs(PyUnicodeObject *v)
8791{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008792 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008793}
8794
8795
/* Method table for the unicode type.  Each entry gives the Python-visible
   name, the C implementation, the calling-convention flags and (where
   supplied) the docstring.  The table ends with a NULL sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* The only entry that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* Registered as a static method (METH_STATIC). */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* Compiled out: "capwords" is not registered. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8860
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008861static PyObject *
8862unicode_mod(PyObject *v, PyObject *w)
8863{
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 if (!PyUnicode_Check(v)) {
8865 Py_INCREF(Py_NotImplemented);
8866 return Py_NotImplemented;
8867 }
8868 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008869}
8870
/* Number protocol table.  Only the remainder slot is filled in (with
   unicode_mod, which forwards to PyUnicode_Format()); the slots after
   nb_remainder are left out of the initializer and are therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
8877
/* Sequence protocol table: length, concatenation, repetition, item access
   and containment are provided; the slice and assignment slots are 0, so
   no item or slice assignment is offered through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8888
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008889static PyObject*
8890unicode_subscript(PyUnicodeObject* self, PyObject* item)
8891{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008892 if (PyIndex_Check(item)) {
8893 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008894 if (i == -1 && PyErr_Occurred())
8895 return NULL;
8896 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008897 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008898 return unicode_getitem(self, i);
8899 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008900 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008901 Py_UNICODE* source_buf;
8902 Py_UNICODE* result_buf;
8903 PyObject* result;
8904
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008905 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008907 return NULL;
8908 }
8909
8910 if (slicelength <= 0) {
8911 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008912 } else if (start == 0 && step == 1 && slicelength == self->length &&
8913 PyUnicode_CheckExact(self)) {
8914 Py_INCREF(self);
8915 return (PyObject *)self;
8916 } else if (step == 1) {
8917 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008918 } else {
8919 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008920 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8921 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008922
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 if (result_buf == NULL)
8924 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008925
8926 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8927 result_buf[i] = source_buf[cur];
8928 }
Tim Petersced69f82003-09-16 20:30:58 +00008929
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008930 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008931 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008932 return result;
8933 }
8934 } else {
8935 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8936 return NULL;
8937 }
8938}
8939
/* Mapping protocol table: length and subscripting (unicode_subscript,
   which handles both indices and slices) are provided; the assignment
   slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947/* Helpers for PyUnicode_Format() */
8948
8949static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008950getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008952 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 (*p_argidx)++;
8955 if (arglen < 0)
8956 return args;
8957 else
8958 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 }
8960 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 return NULL;
8963}
8964
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008965/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008967static PyObject *
8968formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008970 char *p;
8971 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008973
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 x = PyFloat_AsDouble(v);
8975 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008976 return NULL;
8977
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008980
Eric Smith0923d1d2009-04-16 20:16:10 +00008981 p = PyOS_double_to_string(x, type, prec,
8982 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008983 if (p == NULL)
8984 return NULL;
8985 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008986 PyMem_Free(p);
8987 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988}
8989
Tim Peters38fd5b62000-09-21 05:43:11 +00008990static PyObject*
8991formatlong(PyObject *val, int flags, int prec, int type)
8992{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008993 char *buf;
8994 int len;
8995 PyObject *str; /* temporary string object. */
8996 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008997
Benjamin Peterson14339b62009-01-31 16:36:08 +00008998 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8999 if (!str)
9000 return NULL;
9001 result = PyUnicode_FromStringAndSize(buf, len);
9002 Py_DECREF(str);
9003 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009004}
9005
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006static int
9007formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009008 size_t buflen,
9009 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009011 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009012 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 if (PyUnicode_GET_SIZE(v) == 1) {
9014 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9015 buf[1] = '\0';
9016 return 1;
9017 }
9018#ifndef Py_UNICODE_WIDE
9019 if (PyUnicode_GET_SIZE(v) == 2) {
9020 /* Decode a valid surrogate pair */
9021 int c0 = PyUnicode_AS_UNICODE(v)[0];
9022 int c1 = PyUnicode_AS_UNICODE(v)[1];
9023 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9024 0xDC00 <= c1 && c1 <= 0xDFFF) {
9025 buf[0] = c0;
9026 buf[1] = c1;
9027 buf[2] = '\0';
9028 return 2;
9029 }
9030 }
9031#endif
9032 goto onError;
9033 }
9034 else {
9035 /* Integer input truncated to a character */
9036 long x;
9037 x = PyLong_AsLong(v);
9038 if (x == -1 && PyErr_Occurred())
9039 goto onError;
9040
9041 if (x < 0 || x > 0x10ffff) {
9042 PyErr_SetString(PyExc_OverflowError,
9043 "%c arg not in range(0x110000)");
9044 return -1;
9045 }
9046
9047#ifndef Py_UNICODE_WIDE
9048 if (x > 0xffff) {
9049 x -= 0x10000;
9050 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9051 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9052 return 2;
9053 }
9054#endif
9055 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009056 buf[1] = '\0';
9057 return 1;
9058 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009059
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009061 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009063 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064}
9065
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE units, of the scratch buffer
   that PyUnicode_Format() declares for each format specifier; it receives
   the output of formatchar() and the literal '%' of a "%%" specifier.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009070
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073{
9074 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009075 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 int args_owned = 0;
9077 PyUnicodeObject *result = NULL;
9078 PyObject *dict = NULL;
9079 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009080
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 PyErr_BadInternalCall();
9083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 }
9085 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009086 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 fmt = PyUnicode_AS_UNICODE(uformat);
9089 fmtcnt = PyUnicode_GET_SIZE(uformat);
9090
9091 reslen = rescnt = fmtcnt + 100;
9092 result = _PyUnicode_New(reslen);
9093 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 res = PyUnicode_AS_UNICODE(result);
9096
9097 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 arglen = PyTuple_Size(args);
9099 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 }
9101 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 arglen = -1;
9103 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009105 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009106 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108
9109 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 if (*fmt != '%') {
9111 if (--rescnt < 0) {
9112 rescnt = fmtcnt + 100;
9113 reslen += rescnt;
9114 if (_PyUnicode_Resize(&result, reslen) < 0)
9115 goto onError;
9116 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9117 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009118 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 }
9121 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 /* Got a format specifier */
9123 int flags = 0;
9124 Py_ssize_t width = -1;
9125 int prec = -1;
9126 Py_UNICODE c = '\0';
9127 Py_UNICODE fill;
9128 int isnumok;
9129 PyObject *v = NULL;
9130 PyObject *temp = NULL;
9131 Py_UNICODE *pbuf;
9132 Py_UNICODE sign;
9133 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009134 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 fmt++;
9137 if (*fmt == '(') {
9138 Py_UNICODE *keystart;
9139 Py_ssize_t keylen;
9140 PyObject *key;
9141 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009142
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 if (dict == NULL) {
9144 PyErr_SetString(PyExc_TypeError,
9145 "format requires a mapping");
9146 goto onError;
9147 }
9148 ++fmt;
9149 --fmtcnt;
9150 keystart = fmt;
9151 /* Skip over balanced parentheses */
9152 while (pcount > 0 && --fmtcnt >= 0) {
9153 if (*fmt == ')')
9154 --pcount;
9155 else if (*fmt == '(')
9156 ++pcount;
9157 fmt++;
9158 }
9159 keylen = fmt - keystart - 1;
9160 if (fmtcnt < 0 || pcount > 0) {
9161 PyErr_SetString(PyExc_ValueError,
9162 "incomplete format key");
9163 goto onError;
9164 }
9165#if 0
9166 /* keys are converted to strings using UTF-8 and
9167 then looked up since Python uses strings to hold
9168 variables names etc. in its namespaces and we
9169 wouldn't want to break common idioms. */
9170 key = PyUnicode_EncodeUTF8(keystart,
9171 keylen,
9172 NULL);
9173#else
9174 key = PyUnicode_FromUnicode(keystart, keylen);
9175#endif
9176 if (key == NULL)
9177 goto onError;
9178 if (args_owned) {
9179 Py_DECREF(args);
9180 args_owned = 0;
9181 }
9182 args = PyObject_GetItem(dict, key);
9183 Py_DECREF(key);
9184 if (args == NULL) {
9185 goto onError;
9186 }
9187 args_owned = 1;
9188 arglen = -1;
9189 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 while (--fmtcnt >= 0) {
9192 switch (c = *fmt++) {
9193 case '-': flags |= F_LJUST; continue;
9194 case '+': flags |= F_SIGN; continue;
9195 case ' ': flags |= F_BLANK; continue;
9196 case '#': flags |= F_ALT; continue;
9197 case '0': flags |= F_ZERO; continue;
9198 }
9199 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 if (c == '*') {
9202 v = getnextarg(args, arglen, &argidx);
9203 if (v == NULL)
9204 goto onError;
9205 if (!PyLong_Check(v)) {
9206 PyErr_SetString(PyExc_TypeError,
9207 "* wants int");
9208 goto onError;
9209 }
9210 width = PyLong_AsLong(v);
9211 if (width == -1 && PyErr_Occurred())
9212 goto onError;
9213 if (width < 0) {
9214 flags |= F_LJUST;
9215 width = -width;
9216 }
9217 if (--fmtcnt >= 0)
9218 c = *fmt++;
9219 }
9220 else if (c >= '0' && c <= '9') {
9221 width = c - '0';
9222 while (--fmtcnt >= 0) {
9223 c = *fmt++;
9224 if (c < '0' || c > '9')
9225 break;
9226 if ((width*10) / 10 != width) {
9227 PyErr_SetString(PyExc_ValueError,
9228 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009229 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 }
9231 width = width*10 + (c - '0');
9232 }
9233 }
9234 if (c == '.') {
9235 prec = 0;
9236 if (--fmtcnt >= 0)
9237 c = *fmt++;
9238 if (c == '*') {
9239 v = getnextarg(args, arglen, &argidx);
9240 if (v == NULL)
9241 goto onError;
9242 if (!PyLong_Check(v)) {
9243 PyErr_SetString(PyExc_TypeError,
9244 "* wants int");
9245 goto onError;
9246 }
9247 prec = PyLong_AsLong(v);
9248 if (prec == -1 && PyErr_Occurred())
9249 goto onError;
9250 if (prec < 0)
9251 prec = 0;
9252 if (--fmtcnt >= 0)
9253 c = *fmt++;
9254 }
9255 else if (c >= '0' && c <= '9') {
9256 prec = c - '0';
9257 while (--fmtcnt >= 0) {
9258 c = Py_CHARMASK(*fmt++);
9259 if (c < '0' || c > '9')
9260 break;
9261 if ((prec*10) / 10 != prec) {
9262 PyErr_SetString(PyExc_ValueError,
9263 "prec too big");
9264 goto onError;
9265 }
9266 prec = prec*10 + (c - '0');
9267 }
9268 }
9269 } /* prec */
9270 if (fmtcnt >= 0) {
9271 if (c == 'h' || c == 'l' || c == 'L') {
9272 if (--fmtcnt >= 0)
9273 c = *fmt++;
9274 }
9275 }
9276 if (fmtcnt < 0) {
9277 PyErr_SetString(PyExc_ValueError,
9278 "incomplete format");
9279 goto onError;
9280 }
9281 if (c != '%') {
9282 v = getnextarg(args, arglen, &argidx);
9283 if (v == NULL)
9284 goto onError;
9285 }
9286 sign = 0;
9287 fill = ' ';
9288 switch (c) {
9289
9290 case '%':
9291 pbuf = formatbuf;
9292 /* presume that buffer length is at least 1 */
9293 pbuf[0] = '%';
9294 len = 1;
9295 break;
9296
9297 case 's':
9298 case 'r':
9299 case 'a':
9300 if (PyUnicode_Check(v) && c == 's') {
9301 temp = v;
9302 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009303 }
9304 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 if (c == 's')
9306 temp = PyObject_Str(v);
9307 else if (c == 'r')
9308 temp = PyObject_Repr(v);
9309 else
9310 temp = PyObject_ASCII(v);
9311 if (temp == NULL)
9312 goto onError;
9313 if (PyUnicode_Check(temp))
9314 /* nothing to do */;
9315 else {
9316 Py_DECREF(temp);
9317 PyErr_SetString(PyExc_TypeError,
9318 "%s argument has non-string str()");
9319 goto onError;
9320 }
9321 }
9322 pbuf = PyUnicode_AS_UNICODE(temp);
9323 len = PyUnicode_GET_SIZE(temp);
9324 if (prec >= 0 && len > prec)
9325 len = prec;
9326 break;
9327
9328 case 'i':
9329 case 'd':
9330 case 'u':
9331 case 'o':
9332 case 'x':
9333 case 'X':
9334 if (c == 'i')
9335 c = 'd';
9336 isnumok = 0;
9337 if (PyNumber_Check(v)) {
9338 PyObject *iobj=NULL;
9339
9340 if (PyLong_Check(v)) {
9341 iobj = v;
9342 Py_INCREF(iobj);
9343 }
9344 else {
9345 iobj = PyNumber_Long(v);
9346 }
9347 if (iobj!=NULL) {
9348 if (PyLong_Check(iobj)) {
9349 isnumok = 1;
9350 temp = formatlong(iobj, flags, prec, c);
9351 Py_DECREF(iobj);
9352 if (!temp)
9353 goto onError;
9354 pbuf = PyUnicode_AS_UNICODE(temp);
9355 len = PyUnicode_GET_SIZE(temp);
9356 sign = 1;
9357 }
9358 else {
9359 Py_DECREF(iobj);
9360 }
9361 }
9362 }
9363 if (!isnumok) {
9364 PyErr_Format(PyExc_TypeError,
9365 "%%%c format: a number is required, "
9366 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9367 goto onError;
9368 }
9369 if (flags & F_ZERO)
9370 fill = '0';
9371 break;
9372
9373 case 'e':
9374 case 'E':
9375 case 'f':
9376 case 'F':
9377 case 'g':
9378 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009379 temp = formatfloat(v, flags, prec, c);
9380 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009382 pbuf = PyUnicode_AS_UNICODE(temp);
9383 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 sign = 1;
9385 if (flags & F_ZERO)
9386 fill = '0';
9387 break;
9388
9389 case 'c':
9390 pbuf = formatbuf;
9391 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9392 if (len < 0)
9393 goto onError;
9394 break;
9395
9396 default:
9397 PyErr_Format(PyExc_ValueError,
9398 "unsupported format character '%c' (0x%x) "
9399 "at index %zd",
9400 (31<=c && c<=126) ? (char)c : '?',
9401 (int)c,
9402 (Py_ssize_t)(fmt - 1 -
9403 PyUnicode_AS_UNICODE(uformat)));
9404 goto onError;
9405 }
9406 if (sign) {
9407 if (*pbuf == '-' || *pbuf == '+') {
9408 sign = *pbuf++;
9409 len--;
9410 }
9411 else if (flags & F_SIGN)
9412 sign = '+';
9413 else if (flags & F_BLANK)
9414 sign = ' ';
9415 else
9416 sign = 0;
9417 }
9418 if (width < len)
9419 width = len;
9420 if (rescnt - (sign != 0) < width) {
9421 reslen -= rescnt;
9422 rescnt = width + fmtcnt + 100;
9423 reslen += rescnt;
9424 if (reslen < 0) {
9425 Py_XDECREF(temp);
9426 PyErr_NoMemory();
9427 goto onError;
9428 }
9429 if (_PyUnicode_Resize(&result, reslen) < 0) {
9430 Py_XDECREF(temp);
9431 goto onError;
9432 }
9433 res = PyUnicode_AS_UNICODE(result)
9434 + reslen - rescnt;
9435 }
9436 if (sign) {
9437 if (fill != ' ')
9438 *res++ = sign;
9439 rescnt--;
9440 if (width > len)
9441 width--;
9442 }
9443 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9444 assert(pbuf[0] == '0');
9445 assert(pbuf[1] == c);
9446 if (fill != ' ') {
9447 *res++ = *pbuf++;
9448 *res++ = *pbuf++;
9449 }
9450 rescnt -= 2;
9451 width -= 2;
9452 if (width < 0)
9453 width = 0;
9454 len -= 2;
9455 }
9456 if (width > len && !(flags & F_LJUST)) {
9457 do {
9458 --rescnt;
9459 *res++ = fill;
9460 } while (--width > len);
9461 }
9462 if (fill == ' ') {
9463 if (sign)
9464 *res++ = sign;
9465 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9466 assert(pbuf[0] == '0');
9467 assert(pbuf[1] == c);
9468 *res++ = *pbuf++;
9469 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009470 }
9471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 Py_UNICODE_COPY(res, pbuf, len);
9473 res += len;
9474 rescnt -= len;
9475 while (--width >= len) {
9476 --rescnt;
9477 *res++ = ' ';
9478 }
9479 if (dict && (argidx < arglen) && c != '%') {
9480 PyErr_SetString(PyExc_TypeError,
9481 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009482 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 goto onError;
9484 }
9485 Py_XDECREF(temp);
9486 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 } /* until end */
9488 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 PyErr_SetString(PyExc_TypeError,
9490 "not all arguments converted during string formatting");
9491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
9493
Thomas Woutersa96affe2006-03-12 00:29:36 +00009494 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
9499 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 return (PyObject *)result;
9501
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 Py_XDECREF(result);
9504 Py_DECREF(uformat);
9505 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
9508 return NULL;
9509}
9510
Jeremy Hylton938ace62002-07-17 16:30:39 +00009511static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009512unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9513
Tim Peters6d6c1a32001-08-02 04:15:00 +00009514static PyObject *
9515unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9516{
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009518 static char *kwlist[] = {"object", "encoding", "errors", 0};
9519 char *encoding = NULL;
9520 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009521
Benjamin Peterson14339b62009-01-31 16:36:08 +00009522 if (type != &PyUnicode_Type)
9523 return unicode_subtype_new(type, args, kwds);
9524 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 return NULL;
9527 if (x == NULL)
9528 return (PyObject *)_PyUnicode_New(0);
9529 if (encoding == NULL && errors == NULL)
9530 return PyObject_Str(x);
9531 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009533}
9534
Guido van Rossume023fe02001-08-30 03:12:59 +00009535static PyObject *
9536unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9537{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009538 PyUnicodeObject *tmp, *pnew;
9539 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009540
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9542 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9543 if (tmp == NULL)
9544 return NULL;
9545 assert(PyUnicode_Check(tmp));
9546 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9547 if (pnew == NULL) {
9548 Py_DECREF(tmp);
9549 return NULL;
9550 }
9551 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9552 if (pnew->str == NULL) {
9553 _Py_ForgetReference((PyObject *)pnew);
9554 PyObject_Del(pnew);
9555 Py_DECREF(tmp);
9556 return PyErr_NoMemory();
9557 }
9558 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9559 pnew->length = n;
9560 pnew->hash = tmp->hash;
9561 Py_DECREF(tmp);
9562 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009563}
9564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009565PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009567\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009568Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009569encoding defaults to the current default string encoding.\n\
9570errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009571
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009572static PyObject *unicode_iter(PyObject *seq);
9573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009575 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009576 "str", /* tp_name */
9577 sizeof(PyUnicodeObject), /* tp_size */
9578 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 (destructor)unicode_dealloc, /* tp_dealloc */
9581 0, /* tp_print */
9582 0, /* tp_getattr */
9583 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009584 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009585 unicode_repr, /* tp_repr */
9586 &unicode_as_number, /* tp_as_number */
9587 &unicode_as_sequence, /* tp_as_sequence */
9588 &unicode_as_mapping, /* tp_as_mapping */
9589 (hashfunc) unicode_hash, /* tp_hash*/
9590 0, /* tp_call*/
9591 (reprfunc) unicode_str, /* tp_str */
9592 PyObject_GenericGetAttr, /* tp_getattro */
9593 0, /* tp_setattro */
9594 0, /* tp_as_buffer */
9595 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009597 unicode_doc, /* tp_doc */
9598 0, /* tp_traverse */
9599 0, /* tp_clear */
9600 PyUnicode_RichCompare, /* tp_richcompare */
9601 0, /* tp_weaklistoffset */
9602 unicode_iter, /* tp_iter */
9603 0, /* tp_iternext */
9604 unicode_methods, /* tp_methods */
9605 0, /* tp_members */
9606 0, /* tp_getset */
9607 &PyBaseObject_Type, /* tp_base */
9608 0, /* tp_dict */
9609 0, /* tp_descr_get */
9610 0, /* tp_descr_set */
9611 0, /* tp_dictoffset */
9612 0, /* tp_init */
9613 0, /* tp_alloc */
9614 unicode_new, /* tp_new */
9615 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616};
9617
9618/* Initialize the Unicode implementation */
9619
Thomas Wouters78890102000-07-22 19:25:51 +00009620void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009622 int i;
9623
Thomas Wouters477c8d52006-05-27 19:21:47 +00009624 /* XXX - move this array to unicodectype.c ? */
9625 Py_UNICODE linebreak[] = {
9626 0x000A, /* LINE FEED */
9627 0x000D, /* CARRIAGE RETURN */
9628 0x001C, /* FILE SEPARATOR */
9629 0x001D, /* GROUP SEPARATOR */
9630 0x001E, /* RECORD SEPARATOR */
9631 0x0085, /* NEXT LINE */
9632 0x2028, /* LINE SEPARATOR */
9633 0x2029, /* PARAGRAPH SEPARATOR */
9634 };
9635
Fred Drakee4315f52000-05-09 19:53:39 +00009636 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009637 free_list = NULL;
9638 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009640 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009642
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009643 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009645 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009647
9648 /* initialize the linebreak bloom filter */
9649 bloom_linebreak = make_bloom_mask(
9650 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9651 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009652
9653 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654}
9655
9656/* Finalize the Unicode implementation */
9657
Christian Heimesa156e092008-02-16 07:38:31 +00009658int
9659PyUnicode_ClearFreeList(void)
9660{
9661 int freelist_size = numfree;
9662 PyUnicodeObject *u;
9663
9664 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 PyUnicodeObject *v = u;
9666 u = *(PyUnicodeObject **)u;
9667 if (v->str)
9668 PyObject_DEL(v->str);
9669 Py_XDECREF(v->defenc);
9670 PyObject_Del(v);
9671 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009672 }
9673 free_list = NULL;
9674 assert(numfree == 0);
9675 return freelist_size;
9676}
9677
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678void
Thomas Wouters78890102000-07-22 19:25:51 +00009679_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009681 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009683 Py_XDECREF(unicode_empty);
9684 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009685
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009686 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 if (unicode_latin1[i]) {
9688 Py_DECREF(unicode_latin1[i]);
9689 unicode_latin1[i] = NULL;
9690 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009691 }
Christian Heimesa156e092008-02-16 07:38:31 +00009692 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009694
Walter Dörwald16807132007-05-25 13:52:07 +00009695void
9696PyUnicode_InternInPlace(PyObject **p)
9697{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9699 PyObject *t;
9700 if (s == NULL || !PyUnicode_Check(s))
9701 Py_FatalError(
9702 "PyUnicode_InternInPlace: unicode strings only please!");
9703 /* If it's a subclass, we don't really know what putting
9704 it in the interned dict might do. */
9705 if (!PyUnicode_CheckExact(s))
9706 return;
9707 if (PyUnicode_CHECK_INTERNED(s))
9708 return;
9709 if (interned == NULL) {
9710 interned = PyDict_New();
9711 if (interned == NULL) {
9712 PyErr_Clear(); /* Don't leave an exception */
9713 return;
9714 }
9715 }
9716 /* It might be that the GetItem call fails even
9717 though the key is present in the dictionary,
9718 namely when this happens during a stack overflow. */
9719 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009721 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009722
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 if (t) {
9724 Py_INCREF(t);
9725 Py_DECREF(*p);
9726 *p = t;
9727 return;
9728 }
Walter Dörwald16807132007-05-25 13:52:07 +00009729
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 PyThreadState_GET()->recursion_critical = 1;
9731 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9732 PyErr_Clear();
9733 PyThreadState_GET()->recursion_critical = 0;
9734 return;
9735 }
9736 PyThreadState_GET()->recursion_critical = 0;
9737 /* The two references in interned are not counted by refcnt.
9738 The deallocator will take care of this */
9739 Py_REFCNT(s) -= 2;
9740 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009741}
9742
9743void
9744PyUnicode_InternImmortal(PyObject **p)
9745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009746 PyUnicode_InternInPlace(p);
9747 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9748 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9749 Py_INCREF(*p);
9750 }
Walter Dörwald16807132007-05-25 13:52:07 +00009751}
9752
9753PyObject *
9754PyUnicode_InternFromString(const char *cp)
9755{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 PyObject *s = PyUnicode_FromString(cp);
9757 if (s == NULL)
9758 return NULL;
9759 PyUnicode_InternInPlace(&s);
9760 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009761}
9762
9763void _Py_ReleaseInternedUnicodeStrings(void)
9764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 PyObject *keys;
9766 PyUnicodeObject *s;
9767 Py_ssize_t i, n;
9768 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009769
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 if (interned == NULL || !PyDict_Check(interned))
9771 return;
9772 keys = PyDict_Keys(interned);
9773 if (keys == NULL || !PyList_Check(keys)) {
9774 PyErr_Clear();
9775 return;
9776 }
Walter Dörwald16807132007-05-25 13:52:07 +00009777
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9779 detector, interned unicode strings are not forcibly deallocated;
9780 rather, we give them their stolen references back, and then clear
9781 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009782
Benjamin Peterson14339b62009-01-31 16:36:08 +00009783 n = PyList_GET_SIZE(keys);
9784 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009785 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 for (i = 0; i < n; i++) {
9787 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9788 switch (s->state) {
9789 case SSTATE_NOT_INTERNED:
9790 /* XXX Shouldn't happen */
9791 break;
9792 case SSTATE_INTERNED_IMMORTAL:
9793 Py_REFCNT(s) += 1;
9794 immortal_size += s->length;
9795 break;
9796 case SSTATE_INTERNED_MORTAL:
9797 Py_REFCNT(s) += 2;
9798 mortal_size += s->length;
9799 break;
9800 default:
9801 Py_FatalError("Inconsistent interned string state.");
9802 }
9803 s->state = SSTATE_NOT_INTERNED;
9804 }
9805 fprintf(stderr, "total size of all interned strings: "
9806 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9807 "mortal/immortal\n", mortal_size, immortal_size);
9808 Py_DECREF(keys);
9809 PyDict_Clear(interned);
9810 Py_DECREF(interned);
9811 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009812}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009813
9814
9815/********************* Unicode Iterator **************************/
9816
9817typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009818 PyObject_HEAD
9819 Py_ssize_t it_index;
9820 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009821} unicodeiterobject;
9822
9823static void
9824unicodeiter_dealloc(unicodeiterobject *it)
9825{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009826 _PyObject_GC_UNTRACK(it);
9827 Py_XDECREF(it->it_seq);
9828 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009829}
9830
9831static int
9832unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 Py_VISIT(it->it_seq);
9835 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009836}
9837
9838static PyObject *
9839unicodeiter_next(unicodeiterobject *it)
9840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009841 PyUnicodeObject *seq;
9842 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009843
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844 assert(it != NULL);
9845 seq = it->it_seq;
9846 if (seq == NULL)
9847 return NULL;
9848 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009849
Benjamin Peterson14339b62009-01-31 16:36:08 +00009850 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9851 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 if (item != NULL)
9854 ++it->it_index;
9855 return item;
9856 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009857
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 Py_DECREF(seq);
9859 it->it_seq = NULL;
9860 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009861}
9862
9863static PyObject *
9864unicodeiter_len(unicodeiterobject *it)
9865{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009866 Py_ssize_t len = 0;
9867 if (it->it_seq)
9868 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9869 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009870}
9871
9872PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9873
9874static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009875 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009878};
9879
9880PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9882 "str_iterator", /* tp_name */
9883 sizeof(unicodeiterobject), /* tp_basicsize */
9884 0, /* tp_itemsize */
9885 /* methods */
9886 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9887 0, /* tp_print */
9888 0, /* tp_getattr */
9889 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009890 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009891 0, /* tp_repr */
9892 0, /* tp_as_number */
9893 0, /* tp_as_sequence */
9894 0, /* tp_as_mapping */
9895 0, /* tp_hash */
9896 0, /* tp_call */
9897 0, /* tp_str */
9898 PyObject_GenericGetAttr, /* tp_getattro */
9899 0, /* tp_setattro */
9900 0, /* tp_as_buffer */
9901 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9902 0, /* tp_doc */
9903 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9904 0, /* tp_clear */
9905 0, /* tp_richcompare */
9906 0, /* tp_weaklistoffset */
9907 PyObject_SelfIter, /* tp_iter */
9908 (iternextfunc)unicodeiter_next, /* tp_iternext */
9909 unicodeiter_methods, /* tp_methods */
9910 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009911};
9912
9913static PyObject *
9914unicode_iter(PyObject *seq)
9915{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009916 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009917
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 if (!PyUnicode_Check(seq)) {
9919 PyErr_BadInternalCall();
9920 return NULL;
9921 }
9922 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9923 if (it == NULL)
9924 return NULL;
9925 it->it_index = 0;
9926 Py_INCREF(seq);
9927 it->it_seq = (PyUnicodeObject *)seq;
9928 _PyObject_GC_TRACK(it);
9929 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009930}
9931
Martin v. Löwis5b222132007-06-10 09:51:05 +00009932size_t
9933Py_UNICODE_strlen(const Py_UNICODE *u)
9934{
9935 int res = 0;
9936 while(*u++)
9937 res++;
9938 return res;
9939}
9940
9941Py_UNICODE*
9942Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9943{
9944 Py_UNICODE *u = s1;
9945 while ((*u++ = *s2++));
9946 return s1;
9947}
9948
9949Py_UNICODE*
9950Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9951{
9952 Py_UNICODE *u = s1;
9953 while ((*u++ = *s2++))
9954 if (n-- == 0)
9955 break;
9956 return s1;
9957}
9958
9959int
9960Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9961{
9962 while (*s1 && *s2 && *s1 == *s2)
9963 s1++, s2++;
9964 if (*s1 && *s2)
9965 return (*s1 < *s2) ? -1 : +1;
9966 if (*s1)
9967 return 1;
9968 if (*s2)
9969 return -1;
9970 return 0;
9971}
9972
9973Py_UNICODE*
9974Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9975{
9976 const Py_UNICODE *p;
9977 for (p = s; *p; p++)
9978 if (*p == c)
9979 return (Py_UNICODE*)p;
9980 return NULL;
9981}
9982
9983
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009984#ifdef __cplusplus
9985}
9986#endif
9987
9988
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009989/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 Local variables:
9991 c-basic-offset: 4
9992 indent-tabs-mode: nil
9993 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009994*/