blob: fc5c1c536ed952f9b978739f1304fc58544282b8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "keep-alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their length is below this limit.  This
   reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
/* Endianness switches: little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by character code 0x00..0x7F; an entry is 1
   when the character is treated as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 HORIZONTAL TABULATION
     *   0x000A LINE FEED
     *   0x000B VERTICAL TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Same for linebreaks: lookup table indexed by character code
   0x00..0x7F; an entry is 1 when the character is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the low 5
   bits of each Unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 is
   well defined; (1 << 31) would shift into the sign bit of an int. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 use the ascii_linebreak table;
   others are pre-filtered by the bloom mask before the full
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* Membership test: cheap bloom-mask pre-check first, then the exact
   linear search.  The expansion is fully parenthesized so the macro can
   be safely combined with other operators (e.g. `!BLOOM_MEMBER(...)`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Append the NUL-terminated C string `string` at the output cursor `s`,
   advancing `s`.  Relies on variables named `copy` and `s` being
   declared at the point of use. */
#define appendstring(string) \
    {for (copy = string;*copy;) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000794 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* Convert the argument to a bytes object, according to the file
1534 system encoding */
1535
1536int
1537PyUnicode_FSConverter(PyObject* arg, void* addr)
1538{
1539 PyObject *output = NULL;
1540 Py_ssize_t size;
1541 void *data;
1542 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1543 output = arg;
1544 Py_INCREF(output);
1545 }
1546 else {
1547 arg = PyUnicode_FromObject(arg);
1548 if (!arg)
1549 return 0;
1550 output = PyUnicode_AsEncodedObject(arg,
1551 Py_FileSystemDefaultEncoding,
1552 "utf8b");
1553 Py_DECREF(arg);
1554 if (!output)
1555 return 0;
1556 if (!PyBytes_Check(output)) {
1557 Py_DECREF(output);
1558 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1559 return 0;
1560 }
1561 }
1562 if (PyBytes_Check(output)) {
1563 size = PyBytes_GET_SIZE(output);
1564 data = PyBytes_AS_STRING(output);
1565 }
1566 else {
1567 size = PyByteArray_GET_SIZE(output);
1568 data = PyByteArray_AS_STRING(output);
1569 }
1570 if (size != strlen(data)) {
1571 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1572 Py_DECREF(output);
1573 return 0;
1574 }
1575 *(PyObject**)addr = output;
1576 return 1;
1577}
1578
1579
Martin v. Löwis5b222132007-06-10 09:51:05 +00001580char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001581_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001582{
Christian Heimesf3863112007-11-22 07:46:41 +00001583 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001584 if (!PyUnicode_Check(unicode)) {
1585 PyErr_BadArgument();
1586 return NULL;
1587 }
Christian Heimesf3863112007-11-22 07:46:41 +00001588 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1589 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001590 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001591 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001592 *psize = PyBytes_GET_SIZE(bytes);
1593 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001594}
1595
1596char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001597_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001599 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001600}
1601
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1603{
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 goto onError;
1607 }
1608 return PyUnicode_AS_UNICODE(unicode);
1609
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 return NULL;
1612}
1613
Martin v. Löwis18e16552006-02-15 17:27:45 +00001614Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 goto onError;
1619 }
1620 return PyUnicode_GET_SIZE(unicode);
1621
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return -1;
1624}
1625
Thomas Wouters78890102000-07-22 19:25:51 +00001626const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001627{
1628 return unicode_default_encoding;
1629}
1630
1631int PyUnicode_SetDefaultEncoding(const char *encoding)
1632{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001633 if (strcmp(encoding, unicode_default_encoding) != 0) {
1634 PyErr_Format(PyExc_ValueError,
1635 "Can only set default encoding to %s",
1636 unicode_default_encoding);
1637 return -1;
1638 }
Fred Drakee4315f52000-05-09 19:53:39 +00001639 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001640}
1641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642/* error handling callback helper:
1643 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001644 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001645 and adjust various state variables.
1646 return 0 on success, -1 on error
1647*/
1648
1649static
1650int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 const char *encoding, const char *reason,
1652 const char **input, const char **inend, Py_ssize_t *startinpos,
1653 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1654 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001656 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657
1658 PyObject *restuple = NULL;
1659 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001660 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001661 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001662 Py_ssize_t requiredsize;
1663 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001665 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 int res = -1;
1668
1669 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 *errorHandler = PyCodec_LookupError(errors);
1671 if (*errorHandler == NULL)
1672 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 }
1674
1675 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001676 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1678 if (*exceptionObject == NULL)
1679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 }
1681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001682 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1683 goto onError;
1684 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1685 goto onError;
1686 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 }
1689
1690 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1691 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001692 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001694 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001696 }
1697 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001698 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001699
1700 /* Copy back the bytes variables, which might have been modified by the
1701 callback */
1702 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1703 if (!inputobj)
1704 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001705 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001706 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001707 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001708 *input = PyBytes_AS_STRING(inputobj);
1709 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001710 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001711 /* we can DECREF safely, as the exception has another reference,
1712 so the object won't go away. */
1713 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001716 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001717 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001718 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1719 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721
1722 /* need more space? (at least enough for what we
1723 have+the replacement+the rest of the string (starting
1724 at the new input position), so we won't have to check space
1725 when there are no errors in the rest of the string) */
1726 repptr = PyUnicode_AS_UNICODE(repunicode);
1727 repsize = PyUnicode_GET_SIZE(repunicode);
1728 requiredsize = *outpos + repsize + insize-newpos;
1729 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001730 if (requiredsize<2*outsize)
1731 requiredsize = 2*outsize;
1732 if (_PyUnicode_Resize(output, requiredsize) < 0)
1733 goto onError;
1734 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 }
1736 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001737 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 Py_UNICODE_COPY(*outptr, repptr, repsize);
1739 *outptr += repsize;
1740 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 /* we made it! */
1743 res = 0;
1744
Benjamin Peterson29060642009-01-31 22:14:21 +00001745 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 Py_XDECREF(restuple);
1747 return res;
1748}
1749
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750/* --- UTF-7 Codec -------------------------------------------------------- */
1751
Antoine Pitrou244651a2009-05-04 18:56:13 +00001752/* See RFC2152 for details. We encode conservatively and decode liberally. */
1753
/* Base-64 helpers for the UTF-7 codec.  All of them are plain macros
   that evaluate their argument more than once, so pass side-effect-free
   expressions only. */

/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value.  Only meaningful when
   IS_BASE64(c) holds: anything unrecognised falls through to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n (higher bits are ignored). */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a base-64 section of a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)                    \
    ((c) <= 127 && (c) != '+')
1784
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1818
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only 1..127 can ever be direct
 * directO  - non-zero if Set O characters (category 1) are direct
 * directWS - non-zero if whitespace (category 2) is direct
 *
 * Every parameter is parenthesised so that compound arguments such as
 * `a || b` keep their meaning inside the expansion.  c is evaluated
 * several times, so it must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001831PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 Py_ssize_t size,
1833 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001835 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1836}
1837
Antoine Pitrou244651a2009-05-04 18:56:13 +00001838/* The decoder. The only state we preserve is our read position,
1839 * i.e. how many characters we have consumed. So if we end in the
1840 * middle of a shift sequence we have to back off the read position
1841 * and the output to the beginning of the sequence, otherwise we lose
1842 * all the shift state (seen bits, number of bits seen, high
1843 * surrogate). */
1844
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001845PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 Py_ssize_t size,
1847 const char *errors,
1848 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001851 Py_ssize_t startinpos;
1852 Py_ssize_t endinpos;
1853 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854 const char *e;
1855 PyUnicodeObject *unicode;
1856 Py_UNICODE *p;
1857 const char *errmsg = "";
1858 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001859 Py_UNICODE *shiftOutStart;
1860 unsigned int base64bits = 0;
1861 unsigned long base64buffer = 0;
1862 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 PyObject *errorHandler = NULL;
1864 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865
1866 unicode = _PyUnicode_New(size);
1867 if (!unicode)
1868 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001869 if (size == 0) {
1870 if (consumed)
1871 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001873 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001874
1875 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001876 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 e = s + size;
1878
1879 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001882 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883
Antoine Pitrou244651a2009-05-04 18:56:13 +00001884 if (inShift) { /* in a base-64 section */
1885 if (IS_BASE64(ch)) { /* consume a base-64 character */
1886 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1887 base64bits += 6;
1888 s++;
1889 if (base64bits >= 16) {
1890 /* we have enough bits for a UTF-16 value */
1891 Py_UNICODE outCh = (Py_UNICODE)
1892 (base64buffer >> (base64bits-16));
1893 base64bits -= 16;
1894 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1895 if (surrogate) {
1896 /* expecting a second surrogate */
1897 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1898#ifdef Py_UNICODE_WIDE
1899 *p++ = (((surrogate & 0x3FF)<<10)
1900 | (outCh & 0x3FF)) + 0x10000;
1901#else
1902 *p++ = surrogate;
1903 *p++ = outCh;
1904#endif
1905 surrogate = 0;
1906 }
1907 else {
1908 surrogate = 0;
1909 errmsg = "second surrogate missing";
1910 goto utf7Error;
1911 }
1912 }
1913 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1914 /* first surrogate */
1915 surrogate = outCh;
1916 }
1917 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1918 errmsg = "unexpected second surrogate";
1919 goto utf7Error;
1920 }
1921 else {
1922 *p++ = outCh;
1923 }
1924 }
1925 }
1926 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001927 inShift = 0;
1928 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001929 if (surrogate) {
1930 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001931 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001933 if (base64bits > 0) { /* left-over bits */
1934 if (base64bits >= 6) {
1935 /* We've seen at least one base-64 character */
1936 errmsg = "partial character in shift sequence";
1937 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001938 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001939 else {
1940 /* Some bits remain; they should be zero */
1941 if (base64buffer != 0) {
1942 errmsg = "non-zero padding bits in shift sequence";
1943 goto utf7Error;
1944 }
1945 }
1946 }
1947 if (ch != '-') {
1948 /* '-' is absorbed; other terminating
1949 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950 *p++ = ch;
1951 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001952 }
1953 }
1954 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001956 s++; /* consume '+' */
1957 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001958 s++;
1959 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960 }
1961 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001963 shiftOutStart = p;
1964 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965 }
1966 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968 *p++ = ch;
1969 s++;
1970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001971 else {
1972 startinpos = s-starts;
1973 s++;
1974 errmsg = "unexpected special character";
1975 goto utf7Error;
1976 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001977 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001978utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 outpos = p-PyUnicode_AS_UNICODE(unicode);
1980 endinpos = s-starts;
1981 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001982 errors, &errorHandler,
1983 "utf7", errmsg,
1984 &starts, &e, &startinpos, &endinpos, &exc, &s,
1985 &unicode, &outpos, &p))
1986 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987 }
1988
Antoine Pitrou244651a2009-05-04 18:56:13 +00001989 /* end of string */
1990
1991 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1992 /* if we're in an inconsistent state, that's an error */
1993 if (surrogate ||
1994 (base64bits >= 6) ||
1995 (base64bits > 0 && base64buffer != 0)) {
1996 outpos = p-PyUnicode_AS_UNICODE(unicode);
1997 endinpos = size;
1998 if (unicode_decode_call_errorhandler(
1999 errors, &errorHandler,
2000 "utf7", "unterminated shift sequence",
2001 &starts, &e, &startinpos, &endinpos, &exc, &s,
2002 &unicode, &outpos, &p))
2003 goto onError;
2004 if (s < e)
2005 goto restart;
2006 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002007 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002008
2009 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002010 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002011 if (inShift) {
2012 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002013 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014 }
2015 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002016 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002017 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002018 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002019
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002020 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 goto onError;
2022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 Py_XDECREF(errorHandler);
2024 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 return (PyObject *)unicode;
2026
Benjamin Peterson29060642009-01-31 22:14:21 +00002027 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 Py_XDECREF(errorHandler);
2029 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030 Py_DECREF(unicode);
2031 return NULL;
2032}
2033
2034
2035PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002036 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002037 int base64SetO,
2038 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002041 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002042 /* It might be possible to tighten this worst case */
Antoine Pitrou244651a2009-05-04 18:56:13 +00002043 Py_ssize_t allocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002045 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002046 unsigned int base64bits = 0;
2047 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 char * out;
2049 char * start;
2050
2051 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053
Antoine Pitrou244651a2009-05-04 18:56:13 +00002054 if (allocated / 5 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002055 return PyErr_NoMemory();
2056
Antoine Pitrou244651a2009-05-04 18:56:13 +00002057 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 if (v == NULL)
2059 return NULL;
2060
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002061 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 for (;i < size; ++i) {
2063 Py_UNICODE ch = s[i];
2064
Antoine Pitrou244651a2009-05-04 18:56:13 +00002065 if (inShift) {
2066 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2067 /* shifting out */
2068 if (base64bits) { /* output remaining bits */
2069 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2070 base64buffer = 0;
2071 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 }
2073 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 /* Characters not in the BASE64 set implicitly unshift the sequence
2075 so no '-' is required, except if the character is itself a '-' */
2076 if (IS_BASE64(ch) || ch == '-') {
2077 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002079 *out++ = (char) ch;
2080 }
2081 else {
2082 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002084 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 else { /* not in a shift sequence */
2086 if (ch == '+') {
2087 *out++ = '+';
2088 *out++ = '-';
2089 }
2090 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2091 *out++ = (char) ch;
2092 }
2093 else {
2094 *out++ = '+';
2095 inShift = 1;
2096 goto encode_char;
2097 }
2098 }
2099 continue;
2100encode_char:
2101#ifdef Py_UNICODE_WIDE
2102 if (ch >= 0x10000) {
2103 /* code first surrogate */
2104 base64bits += 16;
2105 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2106 while (base64bits >= 6) {
2107 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2108 base64bits -= 6;
2109 }
2110 /* prepare second surrogate */
2111 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2112 }
2113#endif
2114 base64bits += 16;
2115 base64buffer = (base64buffer << 16) | ch;
2116 while (base64bits >= 6) {
2117 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2118 base64bits -= 6;
2119 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002120 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002121 if (base64bits)
2122 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2123 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002125 if (_PyBytes_Resize(&v, out - start) < 0)
2126 return NULL;
2127 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128}
2129
Antoine Pitrou244651a2009-05-04 18:56:13 +00002130#undef IS_BASE64
2131#undef FROM_BASE64
2132#undef TO_BASE64
2133#undef DECODE_DIRECT
2134#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136/* --- UTF-8 Codec -------------------------------------------------------- */
2137
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.
   Indexed by the first byte of a sequence (0..255); the table is
   read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2159
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 Py_ssize_t size,
2162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163{
Walter Dörwald69652032004-09-07 20:24:22 +00002164 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2165}
2166
Antoine Pitrouab868312009-01-10 15:40:25 +00002167/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2168#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2169
2170/* Mask to quickly check whether a C 'long' contains a
2171 non-ASCII, UTF8-encoded char. */
2172#if (SIZEOF_LONG == 8)
2173# define ASCII_CHAR_MASK 0x8080808080808080L
2174#elif (SIZEOF_LONG == 4)
2175# define ASCII_CHAR_MASK 0x80808080L
2176#else
2177# error C 'long' size should be either 4 or 8!
2178#endif
2179
Walter Dörwald69652032004-09-07 20:24:22 +00002180PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002181 Py_ssize_t size,
2182 const char *errors,
2183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002185 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002187 Py_ssize_t startinpos;
2188 Py_ssize_t endinpos;
2189 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002190 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 PyUnicodeObject *unicode;
2192 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002193 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002194 PyObject *errorHandler = NULL;
2195 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
2197 /* Note: size will always be longer than the resulting Unicode
2198 character count */
2199 unicode = _PyUnicode_New(size);
2200 if (!unicode)
2201 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002202 if (size == 0) {
2203 if (consumed)
2204 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207
2208 /* Unpack UTF-8 encoded data */
2209 p = unicode->str;
2210 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002211 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
2213 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002214 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002217 /* Fast path for runs of ASCII characters. Given that common UTF-8
2218 input will consist of an overwhelming majority of ASCII
2219 characters, we try to optimize for this case by checking
2220 as many characters as a C 'long' can contain.
2221 First, check if we can do an aligned read, as most CPUs have
2222 a penalty for unaligned reads.
2223 */
2224 if (!((size_t) s & LONG_PTR_MASK)) {
2225 /* Help register allocation */
2226 register const char *_s = s;
2227 register Py_UNICODE *_p = p;
2228 while (_s < aligned_end) {
2229 /* Read a whole long at a time (either 4 or 8 bytes),
2230 and do a fast unrolled copy if it only contains ASCII
2231 characters. */
2232 unsigned long data = *(unsigned long *) _s;
2233 if (data & ASCII_CHAR_MASK)
2234 break;
2235 _p[0] = (unsigned char) _s[0];
2236 _p[1] = (unsigned char) _s[1];
2237 _p[2] = (unsigned char) _s[2];
2238 _p[3] = (unsigned char) _s[3];
2239#if (SIZEOF_LONG == 8)
2240 _p[4] = (unsigned char) _s[4];
2241 _p[5] = (unsigned char) _s[5];
2242 _p[6] = (unsigned char) _s[6];
2243 _p[7] = (unsigned char) _s[7];
2244#endif
2245 _s += SIZEOF_LONG;
2246 _p += SIZEOF_LONG;
2247 }
2248 s = _s;
2249 p = _p;
2250 if (s == e)
2251 break;
2252 ch = (unsigned char)*s;
2253 }
2254 }
2255
2256 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002257 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 s++;
2259 continue;
2260 }
2261
2262 n = utf8_code_length[ch];
2263
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002264 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 if (consumed)
2266 break;
2267 else {
2268 errmsg = "unexpected end of data";
2269 startinpos = s-starts;
2270 endinpos = size;
2271 goto utf8Error;
2272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274
2275 switch (n) {
2276
2277 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002278 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002279 startinpos = s-starts;
2280 endinpos = startinpos+1;
2281 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282
2283 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002284 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 startinpos = s-starts;
2286 endinpos = startinpos+1;
2287 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288
2289 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002290 if ((s[1] & 0xc0) != 0x80) {
2291 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 startinpos = s-starts;
2293 endinpos = startinpos+2;
2294 goto utf8Error;
2295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002297 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 startinpos = s-starts;
2299 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002300 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 goto utf8Error;
2302 }
2303 else
2304 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 break;
2306
2307 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002308 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002309 (s[2] & 0xc0) != 0x80) {
2310 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002311 startinpos = s-starts;
2312 endinpos = startinpos+3;
2313 goto utf8Error;
2314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002316 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002317 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002318 startinpos = s-starts;
2319 endinpos = startinpos+3;
2320 goto utf8Error;
2321 }
2322 else
2323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002324 break;
2325
2326 case 4:
2327 if ((s[1] & 0xc0) != 0x80 ||
2328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002329 (s[3] & 0xc0) != 0x80) {
2330 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 startinpos = s-starts;
2332 endinpos = startinpos+4;
2333 goto utf8Error;
2334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002338 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002340 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002341 UTF-16 */
2342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002343 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002344 startinpos = s-starts;
2345 endinpos = startinpos+4;
2346 goto utf8Error;
2347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002348#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002353 /* translate from 10000..10FFFF to 0..FFFF */
2354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002356 /* high surrogate = top 10 bits added to D800 */
2357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 break;
2363
2364 default:
2365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002366 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 startinpos = s-starts;
2368 endinpos = startinpos+n;
2369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 }
2371 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002373
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 utf8Error:
2375 outpos = p-PyUnicode_AS_UNICODE(unicode);
2376 if (unicode_decode_call_errorhandler(
2377 errors, &errorHandler,
2378 "utf8", errmsg,
2379 &starts, &e, &startinpos, &endinpos, &exc, &s,
2380 &unicode, &outpos, &p))
2381 goto onError;
2382 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 }
Walter Dörwald69652032004-09-07 20:24:22 +00002384 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002388 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 goto onError;
2390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 Py_XDECREF(errorHandler);
2392 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 return (PyObject *)unicode;
2394
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 Py_XDECREF(errorHandler);
2397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 Py_DECREF(unicode);
2399 return NULL;
2400}
2401
Antoine Pitrouab868312009-01-10 15:40:25 +00002402#undef ASCII_CHAR_MASK
2403
2404
Tim Peters602f7402002-04-27 18:03:26 +00002405/* Allocation strategy: if the string is short, convert into a stack buffer
2406 and allocate exactly as much space needed at the end. Else allocate the
2407 maximum possible needed (4 result bytes per Unicode character), and return
2408 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002409*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002410PyObject *
2411PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 Py_ssize_t size,
2413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414{
Tim Peters602f7402002-04-27 18:03:26 +00002415#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002416
Guido van Rossum98297ee2007-11-06 21:34:58 +00002417 Py_ssize_t i; /* index into s of next input byte */
2418 PyObject *result; /* result string object */
2419 char *p; /* next free byte in output buffer */
2420 Py_ssize_t nallocated; /* number of result bytes allocated */
2421 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002422 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002423 PyObject *errorHandler = NULL;
2424 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002425
Tim Peters602f7402002-04-27 18:03:26 +00002426 assert(s != NULL);
2427 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428
Tim Peters602f7402002-04-27 18:03:26 +00002429 if (size <= MAX_SHORT_UNICHARS) {
2430 /* Write into the stack buffer; nallocated can't overflow.
2431 * At the end, we'll allocate exactly as much heap space as it
2432 * turns out we need.
2433 */
2434 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002435 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002436 p = stackbuf;
2437 }
2438 else {
2439 /* Overallocate on the heap, and give the excess back at the end. */
2440 nallocated = size * 4;
2441 if (nallocated / 4 != size) /* overflow! */
2442 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002443 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002444 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002445 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002446 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002447 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002448
Tim Peters602f7402002-04-27 18:03:26 +00002449 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002450 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002451
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002452 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002453 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002457 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002458 *p++ = (char)(0xc0 | (ch >> 6));
2459 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002460 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002461 else {
Tim Peters602f7402002-04-27 18:03:26 +00002462 /* Encode UCS2 Unicode ordinals */
2463 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002464#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002465 /* Special case: check for high surrogate */
2466 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2467 Py_UCS4 ch2 = s[i];
2468 /* Check for low surrogate and combine the two to
2469 form a UCS4 value */
2470 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002471 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002472 i++;
2473 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002474 }
Tim Peters602f7402002-04-27 18:03:26 +00002475 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002476 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002477#endif
2478 if (ch >= 0xd800 && ch <= 0xdfff) {
2479 Py_ssize_t newpos;
2480 PyObject *rep;
2481 char *prep;
2482 int k;
2483 rep = unicode_encode_call_errorhandler
2484 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2485 s, size, &exc, i-1, i, &newpos);
2486 if (!rep)
2487 goto error;
2488 /* Implementation limitations: only support error handler that return
2489 bytes, and only support up to four replacement bytes. */
2490 if (!PyBytes_Check(rep)) {
2491 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2492 Py_DECREF(rep);
2493 goto error;
2494 }
2495 if (PyBytes_Size(rep) > 4) {
2496 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2497 Py_DECREF(rep);
2498 goto error;
2499 }
2500 prep = PyBytes_AsString(rep);
2501 for(k = PyBytes_Size(rep); k > 0; k--)
2502 *p++ = *prep++;
2503 Py_DECREF(rep);
2504 continue;
2505
2506 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002507 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002508 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2509 *p++ = (char)(0x80 | (ch & 0x3f));
2510 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002513 /* Encode UCS4 Unicode ordinals */
2514 *p++ = (char)(0xf0 | (ch >> 18));
2515 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2516 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2517 *p++ = (char)(0x80 | (ch & 0x3f));
2518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002520
Guido van Rossum98297ee2007-11-06 21:34:58 +00002521 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002522 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002523 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002524 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002525 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002526 }
2527 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002528 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002529 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002530 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002533 Py_XDECREF(errorHandler);
2534 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002535 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002536 error:
2537 Py_XDECREF(errorHandler);
2538 Py_XDECREF(exc);
2539 Py_XDECREF(result);
2540 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002541
Tim Peters602f7402002-04-27 18:03:26 +00002542#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543}
2544
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (!PyUnicode_Check(unicode)) {
2548 PyErr_BadArgument();
2549 return NULL;
2550 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002551 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002552 PyUnicode_GET_SIZE(unicode),
2553 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554}
2555
Walter Dörwald41980ca2007-08-16 21:55:45 +00002556/* --- UTF-32 Codec ------------------------------------------------------- */
2557
2558PyObject *
2559PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 Py_ssize_t size,
2561 const char *errors,
2562 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002563{
2564 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2565}
2566
2567PyObject *
2568PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002569 Py_ssize_t size,
2570 const char *errors,
2571 int *byteorder,
2572 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002573{
2574 const char *starts = s;
2575 Py_ssize_t startinpos;
2576 Py_ssize_t endinpos;
2577 Py_ssize_t outpos;
2578 PyUnicodeObject *unicode;
2579 Py_UNICODE *p;
2580#ifndef Py_UNICODE_WIDE
2581 int i, pairs;
2582#else
2583 const int pairs = 0;
2584#endif
2585 const unsigned char *q, *e;
2586 int bo = 0; /* assume native ordering by default */
2587 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002588 /* Offsets from q for retrieving bytes in the right order. */
2589#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2590 int iorder[] = {0, 1, 2, 3};
2591#else
2592 int iorder[] = {3, 2, 1, 0};
2593#endif
2594 PyObject *errorHandler = NULL;
2595 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002596 /* On narrow builds we split characters outside the BMP into two
2597 codepoints => count how much extra space we need. */
2598#ifndef Py_UNICODE_WIDE
2599 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 if (((Py_UCS4 *)s)[i] >= 0x10000)
2601 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002602#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002603
2604 /* This might be one to much, because of a BOM */
2605 unicode = _PyUnicode_New((size+3)/4+pairs);
2606 if (!unicode)
2607 return NULL;
2608 if (size == 0)
2609 return (PyObject *)unicode;
2610
2611 /* Unpack UTF-32 encoded data */
2612 p = unicode->str;
2613 q = (unsigned char *)s;
2614 e = q + size;
2615
2616 if (byteorder)
2617 bo = *byteorder;
2618
2619 /* Check for BOM marks (U+FEFF) in the input and adjust current
2620 byte order setting accordingly. In native mode, the leading BOM
2621 mark is skipped, in all other modes, it is copied to the output
2622 stream as-is (giving a ZWNBSP character). */
2623 if (bo == 0) {
2624 if (size >= 4) {
2625 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002627#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 if (bom == 0x0000FEFF) {
2629 q += 4;
2630 bo = -1;
2631 }
2632 else if (bom == 0xFFFE0000) {
2633 q += 4;
2634 bo = 1;
2635 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002636#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002637 if (bom == 0x0000FEFF) {
2638 q += 4;
2639 bo = 1;
2640 }
2641 else if (bom == 0xFFFE0000) {
2642 q += 4;
2643 bo = -1;
2644 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002645#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002646 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647 }
2648
2649 if (bo == -1) {
2650 /* force LE */
2651 iorder[0] = 0;
2652 iorder[1] = 1;
2653 iorder[2] = 2;
2654 iorder[3] = 3;
2655 }
2656 else if (bo == 1) {
2657 /* force BE */
2658 iorder[0] = 3;
2659 iorder[1] = 2;
2660 iorder[2] = 1;
2661 iorder[3] = 0;
2662 }
2663
2664 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 Py_UCS4 ch;
2666 /* remaining bytes at the end? (size should be divisible by 4) */
2667 if (e-q<4) {
2668 if (consumed)
2669 break;
2670 errmsg = "truncated data";
2671 startinpos = ((const char *)q)-starts;
2672 endinpos = ((const char *)e)-starts;
2673 goto utf32Error;
2674 /* The remaining input chars are ignored if the callback
2675 chooses to skip the input */
2676 }
2677 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2678 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002679
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 if (ch >= 0x110000)
2681 {
2682 errmsg = "codepoint not in range(0x110000)";
2683 startinpos = ((const char *)q)-starts;
2684 endinpos = startinpos+4;
2685 goto utf32Error;
2686 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002687#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 if (ch >= 0x10000)
2689 {
2690 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2691 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 }
2693 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002694#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002695 *p++ = ch;
2696 q += 4;
2697 continue;
2698 utf32Error:
2699 outpos = p-PyUnicode_AS_UNICODE(unicode);
2700 if (unicode_decode_call_errorhandler(
2701 errors, &errorHandler,
2702 "utf32", errmsg,
2703 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2704 &unicode, &outpos, &p))
2705 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002706 }
2707
2708 if (byteorder)
2709 *byteorder = bo;
2710
2711 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002713
2714 /* Adjust length */
2715 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2716 goto onError;
2717
2718 Py_XDECREF(errorHandler);
2719 Py_XDECREF(exc);
2720 return (PyObject *)unicode;
2721
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002723 Py_DECREF(unicode);
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return NULL;
2727}
2728
2729PyObject *
2730PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 Py_ssize_t size,
2732 const char *errors,
2733 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002735 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002736 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002737 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002738#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002739 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740#else
2741 const int pairs = 0;
2742#endif
2743 /* Offsets from p for storing byte pairs in the right order. */
2744#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2745 int iorder[] = {0, 1, 2, 3};
2746#else
2747 int iorder[] = {3, 2, 1, 0};
2748#endif
2749
Benjamin Peterson29060642009-01-31 22:14:21 +00002750#define STORECHAR(CH) \
2751 do { \
2752 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2753 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2754 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2755 p[iorder[0]] = (CH) & 0xff; \
2756 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002757 } while(0)
2758
2759 /* In narrow builds we can output surrogate pairs as one codepoint,
2760 so we need less space. */
2761#ifndef Py_UNICODE_WIDE
2762 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2764 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2765 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002767 nsize = (size - pairs + (byteorder == 0));
2768 bytesize = nsize * 4;
2769 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002771 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772 if (v == NULL)
2773 return NULL;
2774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002775 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002779 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780
2781 if (byteorder == -1) {
2782 /* force LE */
2783 iorder[0] = 0;
2784 iorder[1] = 1;
2785 iorder[2] = 2;
2786 iorder[3] = 3;
2787 }
2788 else if (byteorder == 1) {
2789 /* force BE */
2790 iorder[0] = 3;
2791 iorder[1] = 2;
2792 iorder[2] = 1;
2793 iorder[3] = 0;
2794 }
2795
2796 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002798#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2800 Py_UCS4 ch2 = *s;
2801 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2802 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2803 s++;
2804 size--;
2805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807#endif
2808 STORECHAR(ch);
2809 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002810
2811 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002812 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002813#undef STORECHAR
2814}
2815
2816PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2817{
2818 if (!PyUnicode_Check(unicode)) {
2819 PyErr_BadArgument();
2820 return NULL;
2821 }
2822 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 PyUnicode_GET_SIZE(unicode),
2824 NULL,
2825 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002826}
2827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828/* --- UTF-16 Codec ------------------------------------------------------- */
2829
Tim Peters772747b2001-08-09 22:21:55 +00002830PyObject *
2831PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_ssize_t size,
2833 const char *errors,
2834 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835{
Walter Dörwald69652032004-09-07 20:24:22 +00002836 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2837}
2838
Antoine Pitrouab868312009-01-10 15:40:25 +00002839/* Two masks for fast checking of whether a C 'long' may contain
2840 UTF16-encoded surrogate characters. This is an efficient heuristic,
2841 assuming that non-surrogate characters with a code point >= 0x8000 are
2842 rare in most input.
2843 FAST_CHAR_MASK is used when the input is in native byte ordering,
2844 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002845*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002846#if (SIZEOF_LONG == 8)
2847# define FAST_CHAR_MASK 0x8000800080008000L
2848# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2849#elif (SIZEOF_LONG == 4)
2850# define FAST_CHAR_MASK 0x80008000L
2851# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2852#else
2853# error C 'long' size should be either 4 or 8!
2854#endif
2855
Walter Dörwald69652032004-09-07 20:24:22 +00002856PyObject *
2857PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 Py_ssize_t size,
2859 const char *errors,
2860 int *byteorder,
2861 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t startinpos;
2865 Py_ssize_t endinpos;
2866 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 PyUnicodeObject *unicode;
2868 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002869 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002870 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002871 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002872 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002873 /* Offsets from q for retrieving byte pairs in the right order. */
2874#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2875 int ihi = 1, ilo = 0;
2876#else
2877 int ihi = 0, ilo = 1;
2878#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 PyObject *errorHandler = NULL;
2880 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881
2882 /* Note: size will always be longer than the resulting Unicode
2883 character count */
2884 unicode = _PyUnicode_New(size);
2885 if (!unicode)
2886 return NULL;
2887 if (size == 0)
2888 return (PyObject *)unicode;
2889
2890 /* Unpack UTF-16 encoded data */
2891 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002892 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002893 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894
2895 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002896 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002898 /* Check for BOM marks (U+FEFF) in the input and adjust current
2899 byte order setting accordingly. In native mode, the leading BOM
2900 mark is skipped, in all other modes, it is copied to the output
2901 stream as-is (giving a ZWNBSP character). */
2902 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002903 if (size >= 2) {
2904 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002905#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 if (bom == 0xFEFF) {
2907 q += 2;
2908 bo = -1;
2909 }
2910 else if (bom == 0xFFFE) {
2911 q += 2;
2912 bo = 1;
2913 }
Tim Petersced69f82003-09-16 20:30:58 +00002914#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 if (bom == 0xFEFF) {
2916 q += 2;
2917 bo = 1;
2918 }
2919 else if (bom == 0xFFFE) {
2920 q += 2;
2921 bo = -1;
2922 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002923#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926
Tim Peters772747b2001-08-09 22:21:55 +00002927 if (bo == -1) {
2928 /* force LE */
2929 ihi = 1;
2930 ilo = 0;
2931 }
2932 else if (bo == 1) {
2933 /* force BE */
2934 ihi = 0;
2935 ilo = 1;
2936 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002937#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2938 native_ordering = ilo < ihi;
2939#else
2940 native_ordering = ilo > ihi;
2941#endif
Tim Peters772747b2001-08-09 22:21:55 +00002942
Antoine Pitrouab868312009-01-10 15:40:25 +00002943 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002944 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002946 /* First check for possible aligned read of a C 'long'. Unaligned
2947 reads are more expensive, better to defer to another iteration. */
2948 if (!((size_t) q & LONG_PTR_MASK)) {
2949 /* Fast path for runs of non-surrogate chars. */
2950 register const unsigned char *_q = q;
2951 Py_UNICODE *_p = p;
2952 if (native_ordering) {
2953 /* Native ordering is simple: as long as the input cannot
2954 possibly contain a surrogate char, do an unrolled copy
2955 of several 16-bit code points to the target object.
2956 The non-surrogate check is done on several input bytes
2957 at a time (as many as a C 'long' can contain). */
2958 while (_q < aligned_end) {
2959 unsigned long data = * (unsigned long *) _q;
2960 if (data & FAST_CHAR_MASK)
2961 break;
2962 _p[0] = ((unsigned short *) _q)[0];
2963 _p[1] = ((unsigned short *) _q)[1];
2964#if (SIZEOF_LONG == 8)
2965 _p[2] = ((unsigned short *) _q)[2];
2966 _p[3] = ((unsigned short *) _q)[3];
2967#endif
2968 _q += SIZEOF_LONG;
2969 _p += SIZEOF_LONG / 2;
2970 }
2971 }
2972 else {
2973 /* Byteswapped ordering is similar, but we must decompose
2974 the copy bytewise, and take care of zero'ing out the
2975 upper bytes if the target object is in 32-bit units
2976 (that is, in UCS-4 builds). */
2977 while (_q < aligned_end) {
2978 unsigned long data = * (unsigned long *) _q;
2979 if (data & SWAPPED_FAST_CHAR_MASK)
2980 break;
2981 /* Zero upper bytes in UCS-4 builds */
2982#if (Py_UNICODE_SIZE > 2)
2983 _p[0] = 0;
2984 _p[1] = 0;
2985#if (SIZEOF_LONG == 8)
2986 _p[2] = 0;
2987 _p[3] = 0;
2988#endif
2989#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002990 /* Issue #4916; UCS-4 builds on big endian machines must
2991 fill the two last bytes of each 4-byte unit. */
2992#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2993# define OFF 2
2994#else
2995# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002996#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002997 ((unsigned char *) _p)[OFF + 1] = _q[0];
2998 ((unsigned char *) _p)[OFF + 0] = _q[1];
2999 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3000 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3001#if (SIZEOF_LONG == 8)
3002 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3003 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3004 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3005 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3006#endif
3007#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003008 _q += SIZEOF_LONG;
3009 _p += SIZEOF_LONG / 2;
3010 }
3011 }
3012 p = _p;
3013 q = _q;
3014 if (q >= e)
3015 break;
3016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018
Benjamin Peterson14339b62009-01-31 16:36:08 +00003019 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003020
3021 if (ch < 0xD800 || ch > 0xDFFF) {
3022 *p++ = ch;
3023 continue;
3024 }
3025
3026 /* UTF-16 code pair: */
3027 if (q > e) {
3028 errmsg = "unexpected end of data";
3029 startinpos = (((const char *)q) - 2) - starts;
3030 endinpos = ((const char *)e) + 1 - starts;
3031 goto utf16Error;
3032 }
3033 if (0xD800 <= ch && ch <= 0xDBFF) {
3034 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3035 q += 2;
3036 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003037#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 *p++ = ch;
3039 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003040#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003042#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 continue;
3044 }
3045 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 startinpos = (((const char *)q)-4)-starts;
3048 endinpos = startinpos+2;
3049 goto utf16Error;
3050 }
3051
Benjamin Peterson14339b62009-01-31 16:36:08 +00003052 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 errmsg = "illegal encoding";
3054 startinpos = (((const char *)q)-2)-starts;
3055 endinpos = startinpos+2;
3056 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 utf16Error:
3059 outpos = p - PyUnicode_AS_UNICODE(unicode);
3060 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003061 errors,
3062 &errorHandler,
3063 "utf16", errmsg,
3064 &starts,
3065 (const char **)&e,
3066 &startinpos,
3067 &endinpos,
3068 &exc,
3069 (const char **)&q,
3070 &unicode,
3071 &outpos,
3072 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003075 /* remaining byte at the end? (size should be even) */
3076 if (e == q) {
3077 if (!consumed) {
3078 errmsg = "truncated data";
3079 startinpos = ((const char *)q) - starts;
3080 endinpos = ((const char *)e) + 1 - starts;
3081 outpos = p - PyUnicode_AS_UNICODE(unicode);
3082 if (unicode_decode_call_errorhandler(
3083 errors,
3084 &errorHandler,
3085 "utf16", errmsg,
3086 &starts,
3087 (const char **)&e,
3088 &startinpos,
3089 &endinpos,
3090 &exc,
3091 (const char **)&q,
3092 &unicode,
3093 &outpos,
3094 &p))
3095 goto onError;
3096 /* The remaining input chars are ignored if the callback
3097 chooses to skip the input */
3098 }
3099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100
3101 if (byteorder)
3102 *byteorder = bo;
3103
Walter Dörwald69652032004-09-07 20:24:22 +00003104 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003108 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 goto onError;
3110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 Py_XDECREF(errorHandler);
3112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 return (PyObject *)unicode;
3114
Benjamin Peterson29060642009-01-31 22:14:21 +00003115 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 Py_XDECREF(errorHandler);
3118 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 return NULL;
3120}
3121
Antoine Pitrouab868312009-01-10 15:40:25 +00003122#undef FAST_CHAR_MASK
3123#undef SWAPPED_FAST_CHAR_MASK
3124
Tim Peters772747b2001-08-09 22:21:55 +00003125PyObject *
3126PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 Py_ssize_t size,
3128 const char *errors,
3129 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003131 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003132 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003133 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003134#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003135 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003136#else
3137 const int pairs = 0;
3138#endif
Tim Peters772747b2001-08-09 22:21:55 +00003139 /* Offsets from p for storing byte pairs in the right order. */
3140#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3141 int ihi = 1, ilo = 0;
3142#else
3143 int ihi = 0, ilo = 1;
3144#endif
3145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146#define STORECHAR(CH) \
3147 do { \
3148 p[ihi] = ((CH) >> 8) & 0xff; \
3149 p[ilo] = (CH) & 0xff; \
3150 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003151 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003153#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003154 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 if (s[i] >= 0x10000)
3156 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003157#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003158 /* 2 * (size + pairs + (byteorder == 0)) */
3159 if (size > PY_SSIZE_T_MAX ||
3160 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003162 nsize = size + pairs + (byteorder == 0);
3163 bytesize = nsize * 2;
3164 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003166 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (v == NULL)
3168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003170 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003173 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003174 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003175
3176 if (byteorder == -1) {
3177 /* force LE */
3178 ihi = 1;
3179 ilo = 0;
3180 }
3181 else if (byteorder == 1) {
3182 /* force BE */
3183 ihi = 0;
3184 ilo = 1;
3185 }
3186
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003187 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 Py_UNICODE ch = *s++;
3189 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003190#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 if (ch >= 0x10000) {
3192 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3193 ch = 0xD800 | ((ch-0x10000) >> 10);
3194 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003195#endif
Tim Peters772747b2001-08-09 22:21:55 +00003196 STORECHAR(ch);
3197 if (ch2)
3198 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003199 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003200
3201 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003202 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003203#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
3206PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3207{
3208 if (!PyUnicode_Check(unicode)) {
3209 PyErr_BadArgument();
3210 return NULL;
3211 }
3212 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 PyUnicode_GET_SIZE(unicode),
3214 NULL,
3215 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216}
3217
3218/* --- Unicode Escape Codec ----------------------------------------------- */
3219
Fredrik Lundh06d12682001-01-24 07:59:11 +00003220static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 Py_ssize_t size,
3224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003227 Py_ssize_t startinpos;
3228 Py_ssize_t endinpos;
3229 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003234 char* message;
3235 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 PyObject *errorHandler = NULL;
3237 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 /* Escaped strings will always be longer than the resulting
3240 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 length after conversion to the true value.
3242 (but if the error callback returns a long replacement string
3243 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 v = _PyUnicode_New(size);
3245 if (v == NULL)
3246 goto onError;
3247 if (size == 0)
3248 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003252
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 while (s < end) {
3254 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003255 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257
3258 /* Non-escape characters are interpreted as Unicode ordinals */
3259 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003260 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 continue;
3262 }
3263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 /* \ - Escapes */
3266 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003267 c = *s++;
3268 if (s > end)
3269 c = '\0'; /* Invalid after \ */
3270 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 case '\n': break;
3274 case '\\': *p++ = '\\'; break;
3275 case '\'': *p++ = '\''; break;
3276 case '\"': *p++ = '\"'; break;
3277 case 'b': *p++ = '\b'; break;
3278 case 'f': *p++ = '\014'; break; /* FF */
3279 case 't': *p++ = '\t'; break;
3280 case 'n': *p++ = '\n'; break;
3281 case 'r': *p++ = '\r'; break;
3282 case 'v': *p++ = '\013'; break; /* VT */
3283 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 case '0': case '1': case '2': case '3':
3287 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003288 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003289 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003290 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003291 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003292 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003294 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 break;
3296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 /* hex escapes */
3298 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003300 digits = 2;
3301 message = "truncated \\xXX escape";
3302 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003306 digits = 4;
3307 message = "truncated \\uXXXX escape";
3308 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003311 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003312 digits = 8;
3313 message = "truncated \\UXXXXXXXX escape";
3314 hexescape:
3315 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 outpos = p-PyUnicode_AS_UNICODE(v);
3317 if (s+digits>end) {
3318 endinpos = size;
3319 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003320 errors, &errorHandler,
3321 "unicodeescape", "end of string in escape sequence",
3322 &starts, &end, &startinpos, &endinpos, &exc, &s,
3323 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 goto onError;
3325 goto nextByte;
3326 }
3327 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003328 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003329 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 endinpos = (s+i+1)-starts;
3331 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 errors, &errorHandler,
3333 "unicodeescape", message,
3334 &starts, &end, &startinpos, &endinpos, &exc, &s,
3335 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003338 }
3339 chr = (chr<<4) & ~0xF;
3340 if (c >= '0' && c <= '9')
3341 chr += c - '0';
3342 else if (c >= 'a' && c <= 'f')
3343 chr += 10 + c - 'a';
3344 else
3345 chr += 10 + c - 'A';
3346 }
3347 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003348 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 /* _decoding_error will have already written into the
3350 target buffer. */
3351 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003352 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003353 /* when we get here, chr is a 32-bit unicode character */
3354 if (chr <= 0xffff)
3355 /* UCS-2 character */
3356 *p++ = (Py_UNICODE) chr;
3357 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003358 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003359 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003360#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003361 *p++ = chr;
3362#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003363 chr -= 0x10000L;
3364 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003365 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003366#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003367 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 endinpos = s-starts;
3369 outpos = p-PyUnicode_AS_UNICODE(v);
3370 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 errors, &errorHandler,
3372 "unicodeescape", "illegal Unicode character",
3373 &starts, &end, &startinpos, &endinpos, &exc, &s,
3374 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003375 goto onError;
3376 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003377 break;
3378
Benjamin Peterson29060642009-01-31 22:14:21 +00003379 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003380 case 'N':
3381 message = "malformed \\N character escape";
3382 if (ucnhash_CAPI == NULL) {
3383 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003384 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003385 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003386 if (m == NULL)
3387 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003388 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003389 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003390 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003391 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003392 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003393 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003394 if (ucnhash_CAPI == NULL)
3395 goto ucnhashError;
3396 }
3397 if (*s == '{') {
3398 const char *start = s+1;
3399 /* look for the closing brace */
3400 while (*s != '}' && s < end)
3401 s++;
3402 if (s > start && s < end && *s == '}') {
3403 /* found a name. look it up in the unicode database */
3404 message = "unknown Unicode character name";
3405 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003406 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003407 goto store;
3408 }
3409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 endinpos = s-starts;
3411 outpos = p-PyUnicode_AS_UNICODE(v);
3412 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 errors, &errorHandler,
3414 "unicodeescape", message,
3415 &starts, &end, &startinpos, &endinpos, &exc, &s,
3416 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003417 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003418 break;
3419
3420 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003421 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 message = "\\ at end of string";
3423 s--;
3424 endinpos = s-starts;
3425 outpos = p-PyUnicode_AS_UNICODE(v);
3426 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 errors, &errorHandler,
3428 "unicodeescape", message,
3429 &starts, &end, &startinpos, &endinpos, &exc, &s,
3430 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003431 goto onError;
3432 }
3433 else {
3434 *p++ = '\\';
3435 *p++ = (unsigned char)s[-1];
3436 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003437 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003439 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003442 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003444 Py_XDECREF(errorHandler);
3445 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003447
Benjamin Peterson29060642009-01-31 22:14:21 +00003448 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003449 PyErr_SetString(
3450 PyExc_UnicodeError,
3451 "\\N escapes not supported (can't load unicodedata module)"
3452 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003453 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 Py_XDECREF(errorHandler);
3455 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003456 return NULL;
3457
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 Py_XDECREF(errorHandler);
3461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 return NULL;
3463}
3464
/* Unicode-Escape encoder.

   PyUnicode_EncodeUnicodeEscape() below returns a bytes object holding the
   Unicode-Escape encoded form of its input.  It takes no "quotes" argument
   and does not wrap the result in quote characters.

*/
3471
Thomas Wouters477c8d52006-05-27 19:21:47 +00003472Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 Py_ssize_t size,
3474 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003475{
3476 /* like wcschr, but doesn't stop at NULL characters */
3477
3478 while (size-- > 0) {
3479 if (*s == ch)
3480 return s;
3481 s++;
3482 }
3483
3484 return NULL;
3485}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003486
Walter Dörwald79e913e2007-05-12 11:08:06 +00003487static const char *hexdigits = "0123456789abcdef";
3488
3489PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003492 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003495#ifdef Py_UNICODE_WIDE
3496 const Py_ssize_t expandsize = 10;
3497#else
3498 const Py_ssize_t expandsize = 6;
3499#endif
3500
Thomas Wouters89f507f2006-12-13 04:49:30 +00003501 /* XXX(nnorwitz): rather than over-allocating, it would be
3502 better to choose a different scheme. Perhaps scan the
3503 first N-chars of the string and allocate based on that size.
3504 */
3505 /* Initial allocation is based on the longest-possible unichr
3506 escape.
3507
3508 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3509 unichr, so in this case it's the longest unichr escape. In
3510 narrow (UTF-16) builds this is five chars per source unichr
3511 since there are two unichrs in the surrogate pair, so in narrow
3512 (UTF-16) builds it's not the longest unichr escape.
3513
3514 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3515 so in the narrow (UTF-16) build case it's the longest unichr
3516 escape.
3517 */
3518
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003519 if (size == 0)
3520 return PyBytes_FromStringAndSize(NULL, 0);
3521
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003522 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003524
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003525 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 2
3527 + expandsize*size
3528 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 if (repr == NULL)
3530 return NULL;
3531
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003532 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 while (size-- > 0) {
3535 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003536
Walter Dörwald79e913e2007-05-12 11:08:06 +00003537 /* Escape backslashes */
3538 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 *p++ = '\\';
3540 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003541 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003542 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003543
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003544#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003545 /* Map 21-bit characters to '\U00xxxxxx' */
3546 else if (ch >= 0x10000) {
3547 *p++ = '\\';
3548 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003549 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3550 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3551 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3552 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3553 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3554 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3555 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3556 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003558 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003559#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3561 else if (ch >= 0xD800 && ch < 0xDC00) {
3562 Py_UNICODE ch2;
3563 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003564
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 ch2 = *s++;
3566 size--;
3567 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3568 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3569 *p++ = '\\';
3570 *p++ = 'U';
3571 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3572 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3573 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3574 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3575 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3576 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3577 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3578 *p++ = hexdigits[ucs & 0x0000000F];
3579 continue;
3580 }
3581 /* Fall through: isolated surrogates are copied as-is */
3582 s--;
3583 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003584 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003585#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003588 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 *p++ = '\\';
3590 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003591 *p++ = hexdigits[(ch >> 12) & 0x000F];
3592 *p++ = hexdigits[(ch >> 8) & 0x000F];
3593 *p++ = hexdigits[(ch >> 4) & 0x000F];
3594 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003596
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003597 /* Map special whitespace to '\t', \n', '\r' */
3598 else if (ch == '\t') {
3599 *p++ = '\\';
3600 *p++ = 't';
3601 }
3602 else if (ch == '\n') {
3603 *p++ = '\\';
3604 *p++ = 'n';
3605 }
3606 else if (ch == '\r') {
3607 *p++ = '\\';
3608 *p++ = 'r';
3609 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003610
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003611 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003612 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003614 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003615 *p++ = hexdigits[(ch >> 4) & 0x000F];
3616 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003617 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003618
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 /* Copy everything else as-is */
3620 else
3621 *p++ = (char) ch;
3622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003624 assert(p - PyBytes_AS_STRING(repr) > 0);
3625 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3626 return NULL;
3627 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628}
3629
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003630PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003632 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 if (!PyUnicode_Check(unicode)) {
3634 PyErr_BadArgument();
3635 return NULL;
3636 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003637 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3638 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003639 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640}
3641
3642/* --- Raw Unicode Escape Codec ------------------------------------------- */
3643
3644PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 Py_ssize_t size,
3646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003649 Py_ssize_t startinpos;
3650 Py_ssize_t endinpos;
3651 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 const char *end;
3655 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 PyObject *errorHandler = NULL;
3657 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 /* Escaped strings will always be longer than the resulting
3660 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 length after conversion to the true value. (But decoding error
3662 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 v = _PyUnicode_New(size);
3664 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 end = s + size;
3670 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 unsigned char c;
3672 Py_UCS4 x;
3673 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003674 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 /* Non-escape characters are interpreted as Unicode ordinals */
3677 if (*s != '\\') {
3678 *p++ = (unsigned char)*s++;
3679 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003680 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 startinpos = s-starts;
3682
3683 /* \u-escapes are only interpreted iff the number of leading
3684 backslashes if odd */
3685 bs = s;
3686 for (;s < end;) {
3687 if (*s != '\\')
3688 break;
3689 *p++ = (unsigned char)*s++;
3690 }
3691 if (((s - bs) & 1) == 0 ||
3692 s >= end ||
3693 (*s != 'u' && *s != 'U')) {
3694 continue;
3695 }
3696 p--;
3697 count = *s=='u' ? 4 : 8;
3698 s++;
3699
3700 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3701 outpos = p-PyUnicode_AS_UNICODE(v);
3702 for (x = 0, i = 0; i < count; ++i, ++s) {
3703 c = (unsigned char)*s;
3704 if (!ISXDIGIT(c)) {
3705 endinpos = s-starts;
3706 if (unicode_decode_call_errorhandler(
3707 errors, &errorHandler,
3708 "rawunicodeescape", "truncated \\uXXXX",
3709 &starts, &end, &startinpos, &endinpos, &exc, &s,
3710 &v, &outpos, &p))
3711 goto onError;
3712 goto nextByte;
3713 }
3714 x = (x<<4) & ~0xF;
3715 if (c >= '0' && c <= '9')
3716 x += c - '0';
3717 else if (c >= 'a' && c <= 'f')
3718 x += 10 + c - 'a';
3719 else
3720 x += 10 + c - 'A';
3721 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003722 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 /* UCS-2 character */
3724 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003725 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 /* UCS-4 character. Either store directly, or as
3727 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003728#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003730#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 x -= 0x10000L;
3732 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3733 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003734#endif
3735 } else {
3736 endinpos = s-starts;
3737 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003738 if (unicode_decode_call_errorhandler(
3739 errors, &errorHandler,
3740 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 &starts, &end, &startinpos, &endinpos, &exc, &s,
3742 &v, &outpos, &p))
3743 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 nextByte:
3746 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003748 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 Py_XDECREF(errorHandler);
3751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003753
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 Py_XDECREF(errorHandler);
3757 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 return NULL;
3759}
3760
3761PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003764 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 char *p;
3766 char *q;
3767
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003768#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003769 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003770#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003771 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003772#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003773
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003774 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003776
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003777 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (repr == NULL)
3779 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003780 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003781 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003783 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 while (size-- > 0) {
3785 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003786#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 /* Map 32-bit characters to '\Uxxxxxxxx' */
3788 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003789 *p++ = '\\';
3790 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003791 *p++ = hexdigits[(ch >> 28) & 0xf];
3792 *p++ = hexdigits[(ch >> 24) & 0xf];
3793 *p++ = hexdigits[(ch >> 20) & 0xf];
3794 *p++ = hexdigits[(ch >> 16) & 0xf];
3795 *p++ = hexdigits[(ch >> 12) & 0xf];
3796 *p++ = hexdigits[(ch >> 8) & 0xf];
3797 *p++ = hexdigits[(ch >> 4) & 0xf];
3798 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003799 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003800 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003801#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003802 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3803 if (ch >= 0xD800 && ch < 0xDC00) {
3804 Py_UNICODE ch2;
3805 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003806
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 ch2 = *s++;
3808 size--;
3809 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3810 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3811 *p++ = '\\';
3812 *p++ = 'U';
3813 *p++ = hexdigits[(ucs >> 28) & 0xf];
3814 *p++ = hexdigits[(ucs >> 24) & 0xf];
3815 *p++ = hexdigits[(ucs >> 20) & 0xf];
3816 *p++ = hexdigits[(ucs >> 16) & 0xf];
3817 *p++ = hexdigits[(ucs >> 12) & 0xf];
3818 *p++ = hexdigits[(ucs >> 8) & 0xf];
3819 *p++ = hexdigits[(ucs >> 4) & 0xf];
3820 *p++ = hexdigits[ucs & 0xf];
3821 continue;
3822 }
3823 /* Fall through: isolated surrogates are copied as-is */
3824 s--;
3825 size++;
3826 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003827#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 /* Map 16-bit characters to '\uxxxx' */
3829 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 *p++ = '\\';
3831 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003832 *p++ = hexdigits[(ch >> 12) & 0xf];
3833 *p++ = hexdigits[(ch >> 8) & 0xf];
3834 *p++ = hexdigits[(ch >> 4) & 0xf];
3835 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 /* Copy everything else as-is */
3838 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 *p++ = (char) ch;
3840 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003841 size = p - q;
3842
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003843 assert(size > 0);
3844 if (_PyBytes_Resize(&repr, size) < 0)
3845 return NULL;
3846 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847}
3848
3849PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3850{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003851 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003853 PyErr_BadArgument();
3854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003856 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3857 PyUnicode_GET_SIZE(unicode));
3858
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003859 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860}
3861
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003862/* --- Unicode Internal Codec ------------------------------------------- */
3863
3864PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 Py_ssize_t size,
3866 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003867{
3868 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003869 Py_ssize_t startinpos;
3870 Py_ssize_t endinpos;
3871 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003872 PyUnicodeObject *v;
3873 Py_UNICODE *p;
3874 const char *end;
3875 const char *reason;
3876 PyObject *errorHandler = NULL;
3877 PyObject *exc = NULL;
3878
Neal Norwitzd43069c2006-01-08 01:12:10 +00003879#ifdef Py_UNICODE_WIDE
3880 Py_UNICODE unimax = PyUnicode_GetMax();
3881#endif
3882
Thomas Wouters89f507f2006-12-13 04:49:30 +00003883 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003884 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3885 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003887 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003888 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003889 p = PyUnicode_AS_UNICODE(v);
3890 end = s + size;
3891
3892 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003893 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003894 /* We have to sanity check the raw data, otherwise doom looms for
3895 some malformed UCS-4 data. */
3896 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003897#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003898 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003899#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003900 end-s < Py_UNICODE_SIZE
3901 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003903 startinpos = s - starts;
3904 if (end-s < Py_UNICODE_SIZE) {
3905 endinpos = end-starts;
3906 reason = "truncated input";
3907 }
3908 else {
3909 endinpos = s - starts + Py_UNICODE_SIZE;
3910 reason = "illegal code point (> 0x10FFFF)";
3911 }
3912 outpos = p - PyUnicode_AS_UNICODE(v);
3913 if (unicode_decode_call_errorhandler(
3914 errors, &errorHandler,
3915 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003916 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003917 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003918 goto onError;
3919 }
3920 }
3921 else {
3922 p++;
3923 s += Py_UNICODE_SIZE;
3924 }
3925 }
3926
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003927 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003928 goto onError;
3929 Py_XDECREF(errorHandler);
3930 Py_XDECREF(exc);
3931 return (PyObject *)v;
3932
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003934 Py_XDECREF(v);
3935 Py_XDECREF(errorHandler);
3936 Py_XDECREF(exc);
3937 return NULL;
3938}
3939
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940/* --- Latin-1 Codec ------------------------------------------------------ */
3941
3942PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_ssize_t size,
3944 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945{
3946 PyUnicodeObject *v;
3947 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003948 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003949
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003951 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 Py_UNICODE r = *(unsigned char*)s;
3953 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003954 }
3955
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 v = _PyUnicode_New(size);
3957 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003962 e = s + size;
3963 /* Unrolling the copy makes it much faster by reducing the looping
3964 overhead. This is similar to what many memcpy() implementations do. */
3965 unrolled_end = e - 4;
3966 while (s < unrolled_end) {
3967 p[0] = (unsigned char) s[0];
3968 p[1] = (unsigned char) s[1];
3969 p[2] = (unsigned char) s[2];
3970 p[3] = (unsigned char) s[3];
3971 s += 4;
3972 p += 4;
3973 }
3974 while (s < e)
3975 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003977
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 Py_XDECREF(v);
3980 return NULL;
3981}
3982
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983/* create or adjust a UnicodeEncodeError */
3984static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 const char *encoding,
3986 const Py_UNICODE *unicode, Py_ssize_t size,
3987 Py_ssize_t startpos, Py_ssize_t endpos,
3988 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 *exceptionObject = PyUnicodeEncodeError_Create(
3992 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 }
3994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3996 goto onError;
3997 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3998 goto onError;
3999 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4000 goto onError;
4001 return;
4002 onError:
4003 Py_DECREF(*exceptionObject);
4004 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 }
4006}
4007
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008/* raises a UnicodeEncodeError */
4009static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 const char *encoding,
4011 const Py_UNICODE *unicode, Py_ssize_t size,
4012 Py_ssize_t startpos, Py_ssize_t endpos,
4013 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014{
4015 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019}
4020
4021/* error handling callback helper:
4022 build arguments, call the callback and check the arguments,
4023 put the result into newpos and return the replacement string, which
4024 has to be freed by the caller */
4025static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 PyObject **errorHandler,
4027 const char *encoding, const char *reason,
4028 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4029 Py_ssize_t startpos, Py_ssize_t endpos,
4030 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004031{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004032 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033
4034 PyObject *restuple;
4035 PyObject *resunicode;
4036
4037 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 }
4042
4043 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047
4048 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004053 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 Py_DECREF(restuple);
4055 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004057 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 &resunicode, newpos)) {
4059 Py_DECREF(restuple);
4060 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004062 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4063 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4064 Py_DECREF(restuple);
4065 return NULL;
4066 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004069 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4071 Py_DECREF(restuple);
4072 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 Py_INCREF(resunicode);
4075 Py_DECREF(restuple);
4076 return resunicode;
4077}
4078
4079static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 Py_ssize_t size,
4081 const char *errors,
4082 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083{
4084 /* output object */
4085 PyObject *res;
4086 /* pointers to the beginning and end+1 of input */
4087 const Py_UNICODE *startp = p;
4088 const Py_UNICODE *endp = p + size;
4089 /* pointer to the beginning of the unencodable characters */
4090 /* const Py_UNICODE *badp = NULL; */
4091 /* pointer into the output */
4092 char *str;
4093 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004094 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004095 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4096 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 PyObject *errorHandler = NULL;
4098 PyObject *exc = NULL;
4099 /* the following variable is used for caching string comparisons
4100 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4101 int known_errorHandler = -1;
4102
4103 /* allocate enough for a simple encoding without
4104 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004105 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004106 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004107 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004109 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004110 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 ressize = size;
4112
4113 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 /* can we encode this? */
4117 if (c<limit) {
4118 /* no overflow check, because we know that the space is enough */
4119 *str++ = (char)c;
4120 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 else {
4123 Py_ssize_t unicodepos = p-startp;
4124 Py_ssize_t requiredsize;
4125 PyObject *repunicode;
4126 Py_ssize_t repsize;
4127 Py_ssize_t newpos;
4128 Py_ssize_t respos;
4129 Py_UNICODE *uni2;
4130 /* startpos for collecting unencodable chars */
4131 const Py_UNICODE *collstart = p;
4132 const Py_UNICODE *collend = p;
4133 /* find all unecodable characters */
4134 while ((collend < endp) && ((*collend)>=limit))
4135 ++collend;
4136 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4137 if (known_errorHandler==-1) {
4138 if ((errors==NULL) || (!strcmp(errors, "strict")))
4139 known_errorHandler = 1;
4140 else if (!strcmp(errors, "replace"))
4141 known_errorHandler = 2;
4142 else if (!strcmp(errors, "ignore"))
4143 known_errorHandler = 3;
4144 else if (!strcmp(errors, "xmlcharrefreplace"))
4145 known_errorHandler = 4;
4146 else
4147 known_errorHandler = 0;
4148 }
4149 switch (known_errorHandler) {
4150 case 1: /* strict */
4151 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4152 goto onError;
4153 case 2: /* replace */
4154 while (collstart++<collend)
4155 *str++ = '?'; /* fall through */
4156 case 3: /* ignore */
4157 p = collend;
4158 break;
4159 case 4: /* xmlcharrefreplace */
4160 respos = str - PyBytes_AS_STRING(res);
4161 /* determine replacement size (temporarily (mis)uses p) */
4162 for (p = collstart, repsize = 0; p < collend; ++p) {
4163 if (*p<10)
4164 repsize += 2+1+1;
4165 else if (*p<100)
4166 repsize += 2+2+1;
4167 else if (*p<1000)
4168 repsize += 2+3+1;
4169 else if (*p<10000)
4170 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004171#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 else
4173 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004174#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 else if (*p<100000)
4176 repsize += 2+5+1;
4177 else if (*p<1000000)
4178 repsize += 2+6+1;
4179 else
4180 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004181#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 }
4183 requiredsize = respos+repsize+(endp-collend);
4184 if (requiredsize > ressize) {
4185 if (requiredsize<2*ressize)
4186 requiredsize = 2*ressize;
4187 if (_PyBytes_Resize(&res, requiredsize))
4188 goto onError;
4189 str = PyBytes_AS_STRING(res) + respos;
4190 ressize = requiredsize;
4191 }
4192 /* generate replacement (temporarily (mis)uses p) */
4193 for (p = collstart; p < collend; ++p) {
4194 str += sprintf(str, "&#%d;", (int)*p);
4195 }
4196 p = collend;
4197 break;
4198 default:
4199 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4200 encoding, reason, startp, size, &exc,
4201 collstart-startp, collend-startp, &newpos);
4202 if (repunicode == NULL)
4203 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004204 if (PyBytes_Check(repunicode)) {
4205 /* Directly copy bytes result to output. */
4206 repsize = PyBytes_Size(repunicode);
4207 if (repsize > 1) {
4208 /* Make room for all additional bytes. */
4209 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4210 Py_DECREF(repunicode);
4211 goto onError;
4212 }
4213 ressize += repsize-1;
4214 }
4215 memcpy(str, PyBytes_AsString(repunicode), repsize);
4216 str += repsize;
4217 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004218 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004219 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 /* need more space? (at least enough for what we
4222 have+the replacement+the rest of the string, so
4223 we won't have to check space for encodable characters) */
4224 respos = str - PyBytes_AS_STRING(res);
4225 repsize = PyUnicode_GET_SIZE(repunicode);
4226 requiredsize = respos+repsize+(endp-collend);
4227 if (requiredsize > ressize) {
4228 if (requiredsize<2*ressize)
4229 requiredsize = 2*ressize;
4230 if (_PyBytes_Resize(&res, requiredsize)) {
4231 Py_DECREF(repunicode);
4232 goto onError;
4233 }
4234 str = PyBytes_AS_STRING(res) + respos;
4235 ressize = requiredsize;
4236 }
4237 /* check if there is anything unencodable in the replacement
4238 and copy it to the output */
4239 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4240 c = *uni2;
4241 if (c >= limit) {
4242 raise_encode_exception(&exc, encoding, startp, size,
4243 unicodepos, unicodepos+1, reason);
4244 Py_DECREF(repunicode);
4245 goto onError;
4246 }
4247 *str = (char)c;
4248 }
4249 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004250 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004251 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004252 }
4253 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004254 /* Resize if we allocated to much */
4255 size = str - PyBytes_AS_STRING(res);
4256 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004257 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004258 if (_PyBytes_Resize(&res, size) < 0)
4259 goto onError;
4260 }
4261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 Py_XDECREF(errorHandler);
4263 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004264 return res;
4265
4266 onError:
4267 Py_XDECREF(res);
4268 Py_XDECREF(errorHandler);
4269 Py_XDECREF(exc);
4270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271}
4272
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 Py_ssize_t size,
4275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278}
4279
4280PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4281{
4282 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 PyErr_BadArgument();
4284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 }
4286 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 PyUnicode_GET_SIZE(unicode),
4288 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289}
4290
4291/* --- 7-bit ASCII Codec -------------------------------------------------- */
4292
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_ssize_t size,
4295 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 PyUnicodeObject *v;
4299 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 Py_ssize_t startinpos;
4301 Py_ssize_t endinpos;
4302 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *e;
4304 PyObject *errorHandler = NULL;
4305 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004306
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004308 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 Py_UNICODE r = *(unsigned char*)s;
4310 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004311 }
Tim Petersced69f82003-09-16 20:30:58 +00004312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 v = _PyUnicode_New(size);
4314 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 e = s + size;
4320 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 register unsigned char c = (unsigned char)*s;
4322 if (c < 128) {
4323 *p++ = c;
4324 ++s;
4325 }
4326 else {
4327 startinpos = s-starts;
4328 endinpos = startinpos + 1;
4329 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4330 if (unicode_decode_call_errorhandler(
4331 errors, &errorHandler,
4332 "ascii", "ordinal not in range(128)",
4333 &starts, &e, &startinpos, &endinpos, &exc, &s,
4334 &v, &outpos, &p))
4335 goto onError;
4336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004338 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4340 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 Py_XDECREF(errorHandler);
4342 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004344
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 Py_XDECREF(errorHandler);
4348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 return NULL;
4350}
4351
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 Py_ssize_t size,
4354 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357}
4358
4359PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4360{
4361 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 PyErr_BadArgument();
4363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 }
4365 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 PyUnicode_GET_SIZE(unicode),
4367 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368}
4369
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004370#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004371
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004372/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004373
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004374#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004375#define NEED_RETRY
4376#endif
4377
4378/* XXX This code is limited to "true" double-byte encodings, as
4379 a) it assumes an incomplete character consists of a single byte, and
4380 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004382
/* Return non-zero when the byte at s[offset] is a DBCS lead byte.
   A byte that IsDBCSLeadByte() accepts is only counted as a lead byte when
   CharPrev() shows it is not the second half of the preceding character:
   either there is no previous character, the previous character does not
   start with a lead byte, or the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4393
4394/*
4395 * Decode MBCS string into unicode object. If 'final' is set, converts
4396 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4397 */
4398static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 const char *s, /* MBCS string */
4400 int size, /* sizeof MBCS string */
4401 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004402{
4403 Py_UNICODE *p;
4404 Py_ssize_t n = 0;
4405 int usize = 0;
4406
4407 assert(size >= 0);
4408
4409 /* Skip trailing lead-byte unless 'final' is set */
4410 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004412
4413 /* First get the size of the result */
4414 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4416 if (usize == 0) {
4417 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4418 return -1;
4419 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004420 }
4421
4422 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 /* Create unicode object */
4424 *v = _PyUnicode_New(usize);
4425 if (*v == NULL)
4426 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004427 }
4428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 /* Extend unicode object */
4430 n = PyUnicode_GET_SIZE(*v);
4431 if (_PyUnicode_Resize(v, n + usize) < 0)
4432 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004433 }
4434
4435 /* Do the conversion */
4436 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 p = PyUnicode_AS_UNICODE(*v) + n;
4438 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4439 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4440 return -1;
4441 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004442 }
4443
4444 return size;
4445}
4446
4447PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 Py_ssize_t size,
4449 const char *errors,
4450 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004451{
4452 PyUnicodeObject *v = NULL;
4453 int done;
4454
4455 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004457
4458#ifdef NEED_RETRY
4459 retry:
4460 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004462 else
4463#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004465
4466 if (done < 0) {
4467 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004469 }
4470
4471 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004473
4474#ifdef NEED_RETRY
4475 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 s += done;
4477 size -= done;
4478 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004479 }
4480#endif
4481
4482 return (PyObject *)v;
4483}
4484
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004485PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 Py_ssize_t size,
4487 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004488{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004489 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4490}
4491
4492/*
4493 * Convert unicode into string object (MBCS).
4494 * Returns 0 if succeed, -1 otherwise.
4495 */
4496static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 const Py_UNICODE *p, /* unicode */
4498 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004499{
4500 int mbcssize = 0;
4501 Py_ssize_t n = 0;
4502
4503 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004504
4505 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004506 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4508 if (mbcssize == 0) {
4509 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4510 return -1;
4511 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004512 }
4513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004514 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 /* Create string object */
4516 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4517 if (*repr == NULL)
4518 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004519 }
4520 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 /* Extend string object */
4522 n = PyBytes_Size(*repr);
4523 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4524 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004525 }
4526
4527 /* Do the conversion */
4528 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 char *s = PyBytes_AS_STRING(*repr) + n;
4530 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4531 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4532 return -1;
4533 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004534 }
4535
4536 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004537}
4538
4539PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 Py_ssize_t size,
4541 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004542{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004543 PyObject *repr = NULL;
4544 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004545
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004548 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004550 else
4551#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004553
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004554 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 Py_XDECREF(repr);
4556 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004557 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004558
4559#ifdef NEED_RETRY
4560 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 p += INT_MAX;
4562 size -= INT_MAX;
4563 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004564 }
4565#endif
4566
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004567 return repr;
4568}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004569
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004570PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4571{
4572 if (!PyUnicode_Check(unicode)) {
4573 PyErr_BadArgument();
4574 return NULL;
4575 }
4576 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 PyUnicode_GET_SIZE(unicode),
4578 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004579}
4580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581#undef NEED_RETRY
4582
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004583#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004584
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585/* --- Character Mapping Codec -------------------------------------------- */
4586
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 Py_ssize_t size,
4589 PyObject *mapping,
4590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t startinpos;
4594 Py_ssize_t endinpos;
4595 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 PyUnicodeObject *v;
4598 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004599 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 PyObject *errorHandler = NULL;
4601 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004602 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004603 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 /* Default to Latin-1 */
4606 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608
4609 v = _PyUnicode_New(size);
4610 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004616 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 mapstring = PyUnicode_AS_UNICODE(mapping);
4618 maplen = PyUnicode_GET_SIZE(mapping);
4619 while (s < e) {
4620 unsigned char ch = *s;
4621 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 if (ch < maplen)
4624 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 if (x == 0xfffe) {
4627 /* undefined mapping */
4628 outpos = p-PyUnicode_AS_UNICODE(v);
4629 startinpos = s-starts;
4630 endinpos = startinpos+1;
4631 if (unicode_decode_call_errorhandler(
4632 errors, &errorHandler,
4633 "charmap", "character maps to <undefined>",
4634 &starts, &e, &startinpos, &endinpos, &exc, &s,
4635 &v, &outpos, &p)) {
4636 goto onError;
4637 }
4638 continue;
4639 }
4640 *p++ = x;
4641 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004642 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004643 }
4644 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 while (s < e) {
4646 unsigned char ch = *s;
4647 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004648
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4650 w = PyLong_FromLong((long)ch);
4651 if (w == NULL)
4652 goto onError;
4653 x = PyObject_GetItem(mapping, w);
4654 Py_DECREF(w);
4655 if (x == NULL) {
4656 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4657 /* No mapping found means: mapping is undefined. */
4658 PyErr_Clear();
4659 x = Py_None;
4660 Py_INCREF(x);
4661 } else
4662 goto onError;
4663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004664
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 /* Apply mapping */
4666 if (PyLong_Check(x)) {
4667 long value = PyLong_AS_LONG(x);
4668 if (value < 0 || value > 65535) {
4669 PyErr_SetString(PyExc_TypeError,
4670 "character mapping must be in range(65536)");
4671 Py_DECREF(x);
4672 goto onError;
4673 }
4674 *p++ = (Py_UNICODE)value;
4675 }
4676 else if (x == Py_None) {
4677 /* undefined mapping */
4678 outpos = p-PyUnicode_AS_UNICODE(v);
4679 startinpos = s-starts;
4680 endinpos = startinpos+1;
4681 if (unicode_decode_call_errorhandler(
4682 errors, &errorHandler,
4683 "charmap", "character maps to <undefined>",
4684 &starts, &e, &startinpos, &endinpos, &exc, &s,
4685 &v, &outpos, &p)) {
4686 Py_DECREF(x);
4687 goto onError;
4688 }
4689 Py_DECREF(x);
4690 continue;
4691 }
4692 else if (PyUnicode_Check(x)) {
4693 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004694
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 if (targetsize == 1)
4696 /* 1-1 mapping */
4697 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004698
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 else if (targetsize > 1) {
4700 /* 1-n mapping */
4701 if (targetsize > extrachars) {
4702 /* resize first */
4703 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4704 Py_ssize_t needed = (targetsize - extrachars) + \
4705 (targetsize << 2);
4706 extrachars += needed;
4707 /* XXX overflow detection missing */
4708 if (_PyUnicode_Resize(&v,
4709 PyUnicode_GET_SIZE(v) + needed) < 0) {
4710 Py_DECREF(x);
4711 goto onError;
4712 }
4713 p = PyUnicode_AS_UNICODE(v) + oldpos;
4714 }
4715 Py_UNICODE_COPY(p,
4716 PyUnicode_AS_UNICODE(x),
4717 targetsize);
4718 p += targetsize;
4719 extrachars -= targetsize;
4720 }
4721 /* 1-0 mapping: skip the character */
4722 }
4723 else {
4724 /* wrong return value */
4725 PyErr_SetString(PyExc_TypeError,
4726 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004727 Py_DECREF(x);
4728 goto onError;
4729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 Py_DECREF(x);
4731 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 }
4734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4736 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_XDECREF(errorHandler);
4738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004740
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 Py_XDECREF(errorHandler);
4743 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_XDECREF(v);
4745 return NULL;
4746}
4747
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004748/* Charmap encoding: the lookup table */
4749
4750struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 PyObject_HEAD
4752 unsigned char level1[32];
4753 int count2, count3;
4754 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004755};
4756
4757static PyObject*
4758encoding_map_size(PyObject *obj, PyObject* args)
4759{
4760 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004761 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004763}
4764
4765static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004766 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 PyDoc_STR("Return the size (in bytes) of this object") },
4768 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004769};
4770
4771static void
4772encoding_map_dealloc(PyObject* o)
4773{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004774 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004775}
4776
4777static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004778 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 "EncodingMap", /*tp_name*/
4780 sizeof(struct encoding_map), /*tp_basicsize*/
4781 0, /*tp_itemsize*/
4782 /* methods */
4783 encoding_map_dealloc, /*tp_dealloc*/
4784 0, /*tp_print*/
4785 0, /*tp_getattr*/
4786 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004787 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 0, /*tp_repr*/
4789 0, /*tp_as_number*/
4790 0, /*tp_as_sequence*/
4791 0, /*tp_as_mapping*/
4792 0, /*tp_hash*/
4793 0, /*tp_call*/
4794 0, /*tp_str*/
4795 0, /*tp_getattro*/
4796 0, /*tp_setattro*/
4797 0, /*tp_as_buffer*/
4798 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4799 0, /*tp_doc*/
4800 0, /*tp_traverse*/
4801 0, /*tp_clear*/
4802 0, /*tp_richcompare*/
4803 0, /*tp_weaklistoffset*/
4804 0, /*tp_iter*/
4805 0, /*tp_iternext*/
4806 encoding_map_methods, /*tp_methods*/
4807 0, /*tp_members*/
4808 0, /*tp_getset*/
4809 0, /*tp_base*/
4810 0, /*tp_dict*/
4811 0, /*tp_descr_get*/
4812 0, /*tp_descr_set*/
4813 0, /*tp_dictoffset*/
4814 0, /*tp_init*/
4815 0, /*tp_alloc*/
4816 0, /*tp_new*/
4817 0, /*tp_free*/
4818 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004819};
4820
4821PyObject*
4822PyUnicode_BuildEncodingMap(PyObject* string)
4823{
4824 Py_UNICODE *decode;
4825 PyObject *result;
4826 struct encoding_map *mresult;
4827 int i;
4828 int need_dict = 0;
4829 unsigned char level1[32];
4830 unsigned char level2[512];
4831 unsigned char *mlevel1, *mlevel2, *mlevel3;
4832 int count2 = 0, count3 = 0;
4833
4834 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4835 PyErr_BadArgument();
4836 return NULL;
4837 }
4838 decode = PyUnicode_AS_UNICODE(string);
4839 memset(level1, 0xFF, sizeof level1);
4840 memset(level2, 0xFF, sizeof level2);
4841
4842 /* If there isn't a one-to-one mapping of NULL to \0,
4843 or if there are non-BMP characters, we need to use
4844 a mapping dictionary. */
4845 if (decode[0] != 0)
4846 need_dict = 1;
4847 for (i = 1; i < 256; i++) {
4848 int l1, l2;
4849 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004850#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004851 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004852#endif
4853 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004854 need_dict = 1;
4855 break;
4856 }
4857 if (decode[i] == 0xFFFE)
4858 /* unmapped character */
4859 continue;
4860 l1 = decode[i] >> 11;
4861 l2 = decode[i] >> 7;
4862 if (level1[l1] == 0xFF)
4863 level1[l1] = count2++;
4864 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004865 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004866 }
4867
4868 if (count2 >= 0xFF || count3 >= 0xFF)
4869 need_dict = 1;
4870
4871 if (need_dict) {
4872 PyObject *result = PyDict_New();
4873 PyObject *key, *value;
4874 if (!result)
4875 return NULL;
4876 for (i = 0; i < 256; i++) {
4877 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004878 key = PyLong_FromLong(decode[i]);
4879 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004880 if (!key || !value)
4881 goto failed1;
4882 if (PyDict_SetItem(result, key, value) == -1)
4883 goto failed1;
4884 Py_DECREF(key);
4885 Py_DECREF(value);
4886 }
4887 return result;
4888 failed1:
4889 Py_XDECREF(key);
4890 Py_XDECREF(value);
4891 Py_DECREF(result);
4892 return NULL;
4893 }
4894
4895 /* Create a three-level trie */
4896 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4897 16*count2 + 128*count3 - 1);
4898 if (!result)
4899 return PyErr_NoMemory();
4900 PyObject_Init(result, &EncodingMapType);
4901 mresult = (struct encoding_map*)result;
4902 mresult->count2 = count2;
4903 mresult->count3 = count3;
4904 mlevel1 = mresult->level1;
4905 mlevel2 = mresult->level23;
4906 mlevel3 = mresult->level23 + 16*count2;
4907 memcpy(mlevel1, level1, 32);
4908 memset(mlevel2, 0xFF, 16*count2);
4909 memset(mlevel3, 0, 128*count3);
4910 count3 = 0;
4911 for (i = 1; i < 256; i++) {
4912 int o1, o2, o3, i2, i3;
4913 if (decode[i] == 0xFFFE)
4914 /* unmapped character */
4915 continue;
4916 o1 = decode[i]>>11;
4917 o2 = (decode[i]>>7) & 0xF;
4918 i2 = 16*mlevel1[o1] + o2;
4919 if (mlevel2[i2] == 0xFF)
4920 mlevel2[i2] = count3++;
4921 o3 = decode[i] & 0x7F;
4922 i3 = 128*mlevel2[i2] + o3;
4923 mlevel3[i3] = i;
4924 }
4925 return result;
4926}
4927
4928static int
4929encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4930{
4931 struct encoding_map *map = (struct encoding_map*)mapping;
4932 int l1 = c>>11;
4933 int l2 = (c>>7) & 0xF;
4934 int l3 = c & 0x7F;
4935 int i;
4936
4937#ifdef Py_UNICODE_WIDE
4938 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004940 }
4941#endif
4942 if (c == 0)
4943 return 0;
4944 /* level 1*/
4945 i = map->level1[l1];
4946 if (i == 0xFF) {
4947 return -1;
4948 }
4949 /* level 2*/
4950 i = map->level23[16*i+l2];
4951 if (i == 0xFF) {
4952 return -1;
4953 }
4954 /* level 3 */
4955 i = map->level23[16*map->count2 + 128*i + l3];
4956 if (i == 0) {
4957 return -1;
4958 }
4959 return i;
4960}
4961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962/* Lookup the character ch in the mapping. If the character
4963 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004964 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966{
Christian Heimes217cfd12007-12-02 14:31:20 +00004967 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968 PyObject *x;
4969
4970 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 x = PyObject_GetItem(mapping, w);
4973 Py_DECREF(w);
4974 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4976 /* No mapping found means: mapping is undefined. */
4977 PyErr_Clear();
4978 x = Py_None;
4979 Py_INCREF(x);
4980 return x;
4981 } else
4982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004984 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004986 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 long value = PyLong_AS_LONG(x);
4988 if (value < 0 || value > 255) {
4989 PyErr_SetString(PyExc_TypeError,
4990 "character mapping must be in range(256)");
4991 Py_DECREF(x);
4992 return NULL;
4993 }
4994 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004996 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 /* wrong return value */
5000 PyErr_Format(PyExc_TypeError,
5001 "character mapping must return integer, bytes or None, not %.400s",
5002 x->ob_type->tp_name);
5003 Py_DECREF(x);
5004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 }
5006}
5007
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005008static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005009charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005010{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005011 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5012 /* exponentially overallocate to minimize reallocations */
5013 if (requiredsize < 2*outsize)
5014 requiredsize = 2*outsize;
5015 if (_PyBytes_Resize(outobj, requiredsize))
5016 return -1;
5017 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005018}
5019
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005024 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 space is available. Return a new reference to the object that
5026 was put in the output buffer, or Py_None, if the mapping was undefined
5027 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005028 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005030charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005033 PyObject *rep;
5034 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005035 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036
Christian Heimes90aa7642007-12-19 02:45:37 +00005037 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005038 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005040 if (res == -1)
5041 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 if (outsize<requiredsize)
5043 if (charmapencode_resize(outobj, outpos, requiredsize))
5044 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005045 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 outstart[(*outpos)++] = (char)res;
5047 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005048 }
5049
5050 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005053 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 Py_DECREF(rep);
5055 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005056 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 if (PyLong_Check(rep)) {
5058 Py_ssize_t requiredsize = *outpos+1;
5059 if (outsize<requiredsize)
5060 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5061 Py_DECREF(rep);
5062 return enc_EXCEPTION;
5063 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005064 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 else {
5068 const char *repchars = PyBytes_AS_STRING(rep);
5069 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5070 Py_ssize_t requiredsize = *outpos+repsize;
5071 if (outsize<requiredsize)
5072 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5073 Py_DECREF(rep);
5074 return enc_EXCEPTION;
5075 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005076 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 memcpy(outstart + *outpos, repchars, repsize);
5078 *outpos += repsize;
5079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005081 Py_DECREF(rep);
5082 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083}
5084
5085/* handle an error in PyUnicode_EncodeCharmap
5086 Return 0 on success, -1 on error */
5087static
5088int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005089 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005091 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005092 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093{
5094 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005095 Py_ssize_t repsize;
5096 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 Py_UNICODE *uni2;
5098 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005099 Py_ssize_t collstartpos = *inpos;
5100 Py_ssize_t collendpos = *inpos+1;
5101 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 char *encoding = "charmap";
5103 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005104 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 /* find all unencodable characters */
5107 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005108 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005109 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 int res = encoding_map_lookup(p[collendpos], mapping);
5111 if (res != -1)
5112 break;
5113 ++collendpos;
5114 continue;
5115 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005116
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 rep = charmapencode_lookup(p[collendpos], mapping);
5118 if (rep==NULL)
5119 return -1;
5120 else if (rep!=Py_None) {
5121 Py_DECREF(rep);
5122 break;
5123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005124 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 }
5127 /* cache callback name lookup
5128 * (if not done yet, i.e. it's the first error) */
5129 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 if ((errors==NULL) || (!strcmp(errors, "strict")))
5131 *known_errorHandler = 1;
5132 else if (!strcmp(errors, "replace"))
5133 *known_errorHandler = 2;
5134 else if (!strcmp(errors, "ignore"))
5135 *known_errorHandler = 3;
5136 else if (!strcmp(errors, "xmlcharrefreplace"))
5137 *known_errorHandler = 4;
5138 else
5139 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 }
5141 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005142 case 1: /* strict */
5143 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5144 return -1;
5145 case 2: /* replace */
5146 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 x = charmapencode_output('?', mapping, res, respos);
5148 if (x==enc_EXCEPTION) {
5149 return -1;
5150 }
5151 else if (x==enc_FAILED) {
5152 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5153 return -1;
5154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005155 }
5156 /* fall through */
5157 case 3: /* ignore */
5158 *inpos = collendpos;
5159 break;
5160 case 4: /* xmlcharrefreplace */
5161 /* generate replacement (temporarily (mis)uses p) */
5162 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 char buffer[2+29+1+1];
5164 char *cp;
5165 sprintf(buffer, "&#%d;", (int)p[collpos]);
5166 for (cp = buffer; *cp; ++cp) {
5167 x = charmapencode_output(*cp, mapping, res, respos);
5168 if (x==enc_EXCEPTION)
5169 return -1;
5170 else if (x==enc_FAILED) {
5171 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5172 return -1;
5173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005174 }
5175 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005176 *inpos = collendpos;
5177 break;
5178 default:
5179 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 encoding, reason, p, size, exceptionObject,
5181 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005184 if (PyBytes_Check(repunicode)) {
5185 /* Directly copy bytes result to output. */
5186 Py_ssize_t outsize = PyBytes_Size(*res);
5187 Py_ssize_t requiredsize;
5188 repsize = PyBytes_Size(repunicode);
5189 requiredsize = *respos + repsize;
5190 if (requiredsize > outsize)
5191 /* Make room for all additional bytes. */
5192 if (charmapencode_resize(res, respos, requiredsize)) {
5193 Py_DECREF(repunicode);
5194 return -1;
5195 }
5196 memcpy(PyBytes_AsString(*res) + *respos,
5197 PyBytes_AsString(repunicode), repsize);
5198 *respos += repsize;
5199 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005200 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005201 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005202 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005203 /* generate replacement */
5204 repsize = PyUnicode_GET_SIZE(repunicode);
5205 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 x = charmapencode_output(*uni2, mapping, res, respos);
5207 if (x==enc_EXCEPTION) {
5208 return -1;
5209 }
5210 else if (x==enc_FAILED) {
5211 Py_DECREF(repunicode);
5212 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5213 return -1;
5214 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005215 }
5216 *inpos = newpos;
5217 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 }
5219 return 0;
5220}
5221
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 Py_ssize_t size,
5224 PyObject *mapping,
5225 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 /* output object */
5228 PyObject *res = NULL;
5229 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL;
5235 /* the following variable is used for caching string comparisons
5236 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5237 * 3=ignore, 4=xmlcharrefreplace */
5238 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
5240 /* Default to Latin-1 */
5241 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 /* allocate enough for a simple encoding without
5245 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005246 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 if (res == NULL)
5248 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005249 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 /* try to encode it */
5254 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5255 if (x==enc_EXCEPTION) /* error */
5256 goto onError;
5257 if (x==enc_FAILED) { /* unencodable character */
5258 if (charmap_encoding_error(p, size, &inpos, mapping,
5259 &exc,
5260 &known_errorHandler, &errorHandler, errors,
5261 &res, &respos)) {
5262 goto onError;
5263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005264 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 else
5266 /* done with this character => adjust input position */
5267 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005271 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005272 if (_PyBytes_Resize(&res, respos) < 0)
5273 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 Py_XDECREF(exc);
5276 Py_XDECREF(errorHandler);
5277 return res;
5278
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 Py_XDECREF(res);
5281 Py_XDECREF(exc);
5282 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 return NULL;
5284}
5285
5286PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288{
5289 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 PyErr_BadArgument();
5291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 }
5293 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 PyUnicode_GET_SIZE(unicode),
5295 mapping,
5296 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297}
5298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005299/* create or adjust a UnicodeTranslateError */
5300static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 const Py_UNICODE *unicode, Py_ssize_t size,
5302 Py_ssize_t startpos, Py_ssize_t endpos,
5303 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005306 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 }
5309 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5311 goto onError;
5312 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5313 goto onError;
5314 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5315 goto onError;
5316 return;
5317 onError:
5318 Py_DECREF(*exceptionObject);
5319 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 }
5321}
5322
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005323/* raises a UnicodeTranslateError */
5324static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 const Py_UNICODE *unicode, Py_ssize_t size,
5326 Py_ssize_t startpos, Py_ssize_t endpos,
5327 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005328{
5329 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333}
5334
5335/* error handling callback helper:
5336 build arguments, call the callback and check the arguments,
5337 put the result into newpos and return the replacement string, which
5338 has to be freed by the caller */
5339static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 PyObject **errorHandler,
5341 const char *reason,
5342 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5343 Py_ssize_t startpos, Py_ssize_t endpos,
5344 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005346 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005348 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 PyObject *restuple;
5350 PyObject *resunicode;
5351
5352 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 }
5357
5358 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362
5363 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005368 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 Py_DECREF(restuple);
5370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 }
5372 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 &resunicode, &i_newpos)) {
5374 Py_DECREF(restuple);
5375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 else
5380 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005381 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5383 Py_DECREF(restuple);
5384 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 Py_INCREF(resunicode);
5387 Py_DECREF(restuple);
5388 return resunicode;
5389}
5390
5391/* Lookup the character ch in the mapping and put the result in result,
5392 which must be decrefed by the caller.
5393 Return 0 on success, -1 on error */
5394static
5395int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5396{
Christian Heimes217cfd12007-12-02 14:31:20 +00005397 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 PyObject *x;
5399
5400 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402 x = PyObject_GetItem(mapping, w);
5403 Py_DECREF(w);
5404 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5406 /* No mapping found means: use 1:1 mapping. */
5407 PyErr_Clear();
5408 *result = NULL;
5409 return 0;
5410 } else
5411 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412 }
5413 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 *result = x;
5415 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005417 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 long value = PyLong_AS_LONG(x);
5419 long max = PyUnicode_GetMax();
5420 if (value < 0 || value > max) {
5421 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005422 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 Py_DECREF(x);
5424 return -1;
5425 }
5426 *result = x;
5427 return 0;
5428 }
5429 else if (PyUnicode_Check(x)) {
5430 *result = x;
5431 return 0;
5432 }
5433 else {
5434 /* wrong return value */
5435 PyErr_SetString(PyExc_TypeError,
5436 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005437 Py_DECREF(x);
5438 return -1;
5439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440}
5441/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 if not reallocate and adjust various state variables.
5443 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444static
Walter Dörwald4894c302003-10-24 14:25:28 +00005445int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005448 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005449 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 /* remember old output position */
5451 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5452 /* exponentially overallocate to minimize reallocations */
5453 if (requiredsize < 2 * oldsize)
5454 requiredsize = 2 * oldsize;
5455 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5456 return -1;
5457 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 }
5459 return 0;
5460}
5461/* lookup the character, put the result in the output string and adjust
5462 various state variables. Return a new reference to the object that
5463 was put in the output buffer in *result, or Py_None, if the mapping was
5464 undefined (in which case no character was written).
5465 The called must decref result.
5466 Return 0 on success, -1 on error. */
5467static
Walter Dörwald4894c302003-10-24 14:25:28 +00005468int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5470 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471{
Walter Dörwald4894c302003-10-24 14:25:28 +00005472 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 /* not found => default to 1:1 mapping */
5476 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005477 }
5478 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005480 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 /* no overflow check, because we know that the space is enough */
5482 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 }
5484 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5486 if (repsize==1) {
5487 /* no overflow check, because we know that the space is enough */
5488 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5489 }
5490 else if (repsize!=0) {
5491 /* more than one character */
5492 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5493 (insize - (curinp-startinp)) +
5494 repsize - 1;
5495 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5496 return -1;
5497 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5498 *outp += repsize;
5499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500 }
5501 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 return 0;
5504}
5505
5506PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 Py_ssize_t size,
5508 PyObject *mapping,
5509 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 /* output object */
5512 PyObject *res = NULL;
5513 /* pointers to the beginning and end+1 of input */
5514 const Py_UNICODE *startp = p;
5515 const Py_UNICODE *endp = p + size;
5516 /* pointer into the output */
5517 Py_UNICODE *str;
5518 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 char *reason = "character maps to <undefined>";
5521 PyObject *errorHandler = NULL;
5522 PyObject *exc = NULL;
5523 /* the following variable is used for caching string comparisons
5524 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5525 * 3=ignore, 4=xmlcharrefreplace */
5526 int known_errorHandler = -1;
5527
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 PyErr_BadArgument();
5530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532
5533 /* allocate enough for a simple 1:1 translation without
5534 replacements, if we need more, we'll resize */
5535 res = PyUnicode_FromUnicode(NULL, size);
5536 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 /* try to encode it */
5544 PyObject *x = NULL;
5545 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5546 Py_XDECREF(x);
5547 goto onError;
5548 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005549 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 if (x!=Py_None) /* it worked => adjust input pointer */
5551 ++p;
5552 else { /* untranslatable character */
5553 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5554 Py_ssize_t repsize;
5555 Py_ssize_t newpos;
5556 Py_UNICODE *uni2;
5557 /* startpos for collecting untranslatable chars */
5558 const Py_UNICODE *collstart = p;
5559 const Py_UNICODE *collend = p+1;
5560 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 /* find all untranslatable characters */
5563 while (collend < endp) {
5564 if (charmaptranslate_lookup(*collend, mapping, &x))
5565 goto onError;
5566 Py_XDECREF(x);
5567 if (x!=Py_None)
5568 break;
5569 ++collend;
5570 }
5571 /* cache callback name lookup
5572 * (if not done yet, i.e. it's the first error) */
5573 if (known_errorHandler==-1) {
5574 if ((errors==NULL) || (!strcmp(errors, "strict")))
5575 known_errorHandler = 1;
5576 else if (!strcmp(errors, "replace"))
5577 known_errorHandler = 2;
5578 else if (!strcmp(errors, "ignore"))
5579 known_errorHandler = 3;
5580 else if (!strcmp(errors, "xmlcharrefreplace"))
5581 known_errorHandler = 4;
5582 else
5583 known_errorHandler = 0;
5584 }
5585 switch (known_errorHandler) {
5586 case 1: /* strict */
5587 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005588 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 case 2: /* replace */
5590 /* No need to check for space, this is a 1:1 replacement */
5591 for (coll = collstart; coll<collend; ++coll)
5592 *str++ = '?';
5593 /* fall through */
5594 case 3: /* ignore */
5595 p = collend;
5596 break;
5597 case 4: /* xmlcharrefreplace */
5598 /* generate replacement (temporarily (mis)uses p) */
5599 for (p = collstart; p < collend; ++p) {
5600 char buffer[2+29+1+1];
5601 char *cp;
5602 sprintf(buffer, "&#%d;", (int)*p);
5603 if (charmaptranslate_makespace(&res, &str,
5604 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5605 goto onError;
5606 for (cp = buffer; *cp; ++cp)
5607 *str++ = *cp;
5608 }
5609 p = collend;
5610 break;
5611 default:
5612 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5613 reason, startp, size, &exc,
5614 collstart-startp, collend-startp, &newpos);
5615 if (repunicode == NULL)
5616 goto onError;
5617 /* generate replacement */
5618 repsize = PyUnicode_GET_SIZE(repunicode);
5619 if (charmaptranslate_makespace(&res, &str,
5620 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5621 Py_DECREF(repunicode);
5622 goto onError;
5623 }
5624 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5625 *str++ = *uni2;
5626 p = startp + newpos;
5627 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005629 }
5630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631 /* Resize if we allocated to much */
5632 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005633 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 if (PyUnicode_Resize(&res, respos) < 0)
5635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 }
5637 Py_XDECREF(exc);
5638 Py_XDECREF(errorHandler);
5639 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 Py_XDECREF(res);
5643 Py_XDECREF(exc);
5644 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 return NULL;
5646}
5647
5648PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 PyObject *mapping,
5650 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651{
5652 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 str = PyUnicode_FromObject(str);
5655 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 PyUnicode_GET_SIZE(str),
5659 mapping,
5660 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 Py_DECREF(str);
5662 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005663
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 Py_XDECREF(str);
5666 return NULL;
5667}
Tim Petersced69f82003-09-16 20:30:58 +00005668
Guido van Rossum9e896b32000-04-05 20:11:21 +00005669/* --- Decimal Encoder ---------------------------------------------------- */
5670
5671int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 Py_ssize_t length,
5673 char *output,
5674 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005675{
5676 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 PyObject *errorHandler = NULL;
5678 PyObject *exc = NULL;
5679 const char *encoding = "decimal";
5680 const char *reason = "invalid decimal Unicode string";
5681 /* the following variable is used for caching string comparisons
5682 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5683 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005684
5685 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 PyErr_BadArgument();
5687 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005688 }
5689
5690 p = s;
5691 end = s + length;
5692 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 register Py_UNICODE ch = *p;
5694 int decimal;
5695 PyObject *repunicode;
5696 Py_ssize_t repsize;
5697 Py_ssize_t newpos;
5698 Py_UNICODE *uni2;
5699 Py_UNICODE *collstart;
5700 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005701
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005703 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 ++p;
5705 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 decimal = Py_UNICODE_TODECIMAL(ch);
5708 if (decimal >= 0) {
5709 *output++ = '0' + decimal;
5710 ++p;
5711 continue;
5712 }
5713 if (0 < ch && ch < 256) {
5714 *output++ = (char)ch;
5715 ++p;
5716 continue;
5717 }
5718 /* All other characters are considered unencodable */
5719 collstart = p;
5720 collend = p+1;
5721 while (collend < end) {
5722 if ((0 < *collend && *collend < 256) ||
5723 !Py_UNICODE_ISSPACE(*collend) ||
5724 Py_UNICODE_TODECIMAL(*collend))
5725 break;
5726 }
5727 /* cache callback name lookup
5728 * (if not done yet, i.e. it's the first error) */
5729 if (known_errorHandler==-1) {
5730 if ((errors==NULL) || (!strcmp(errors, "strict")))
5731 known_errorHandler = 1;
5732 else if (!strcmp(errors, "replace"))
5733 known_errorHandler = 2;
5734 else if (!strcmp(errors, "ignore"))
5735 known_errorHandler = 3;
5736 else if (!strcmp(errors, "xmlcharrefreplace"))
5737 known_errorHandler = 4;
5738 else
5739 known_errorHandler = 0;
5740 }
5741 switch (known_errorHandler) {
5742 case 1: /* strict */
5743 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5744 goto onError;
5745 case 2: /* replace */
5746 for (p = collstart; p < collend; ++p)
5747 *output++ = '?';
5748 /* fall through */
5749 case 3: /* ignore */
5750 p = collend;
5751 break;
5752 case 4: /* xmlcharrefreplace */
5753 /* generate replacement (temporarily (mis)uses p) */
5754 for (p = collstart; p < collend; ++p)
5755 output += sprintf(output, "&#%d;", (int)*p);
5756 p = collend;
5757 break;
5758 default:
5759 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5760 encoding, reason, s, length, &exc,
5761 collstart-s, collend-s, &newpos);
5762 if (repunicode == NULL)
5763 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005764 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005765 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005766 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5767 Py_DECREF(repunicode);
5768 goto onError;
5769 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 /* generate replacement */
5771 repsize = PyUnicode_GET_SIZE(repunicode);
5772 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5773 Py_UNICODE ch = *uni2;
5774 if (Py_UNICODE_ISSPACE(ch))
5775 *output++ = ' ';
5776 else {
5777 decimal = Py_UNICODE_TODECIMAL(ch);
5778 if (decimal >= 0)
5779 *output++ = '0' + decimal;
5780 else if (0 < ch && ch < 256)
5781 *output++ = (char)ch;
5782 else {
5783 Py_DECREF(repunicode);
5784 raise_encode_exception(&exc, encoding,
5785 s, length, collstart-s, collend-s, reason);
5786 goto onError;
5787 }
5788 }
5789 }
5790 p = s + newpos;
5791 Py_DECREF(repunicode);
5792 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005793 }
5794 /* 0-terminate the output string */
5795 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_XDECREF(exc);
5797 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005798 return 0;
5799
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 Py_XDECREF(exc);
5802 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005803 return -1;
5804}
5805
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806/* --- Helpers ------------------------------------------------------------ */
5807
Eric Smith8c663262007-08-25 02:26:07 +00005808#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005809#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005811/* Include _ParseTupleFinds from find.h */
5812#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813#include "stringlib/find.h"
5814#include "stringlib/partition.h"
5815
Eric Smith5807c412008-05-11 21:00:57 +00005816#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005817#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005818#include "stringlib/localeutil.h"
5819
/* Helper macro that normalizes slice bounds against (obj)->length.
   It reads and writes the variables `start` and `end` of the enclosing
   scope: a negative value is counted from the end and then clamped to 0,
   and `end` is additionally clamped to the length.  `start` is not
   clamped to the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5832
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005834 PyObject *substr,
5835 Py_ssize_t start,
5836 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005838 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005839 PyUnicodeObject* str_obj;
5840 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005841
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5843 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005845 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5846 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 Py_DECREF(str_obj);
5848 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Tim Petersced69f82003-09-16 20:30:58 +00005850
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005852
Thomas Wouters477c8d52006-05-27 19:21:47 +00005853 result = stringlib_count(
5854 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5855 );
5856
5857 Py_DECREF(sub_obj);
5858 Py_DECREF(str_obj);
5859
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 return result;
5861}
5862
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005864 PyObject *sub,
5865 Py_ssize_t start,
5866 Py_ssize_t end,
5867 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005872 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 sub = PyUnicode_FromObject(sub);
5875 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 Py_DECREF(str);
5877 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 }
Tim Petersced69f82003-09-16 20:30:58 +00005879
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 if (direction > 0)
5881 result = stringlib_find_slice(
5882 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5883 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5884 start, end
5885 );
5886 else
5887 result = stringlib_rfind_slice(
5888 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5889 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5890 start, end
5891 );
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005894 Py_DECREF(sub);
5895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 return result;
5897}
5898
Tim Petersced69f82003-09-16 20:30:58 +00005899static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 PyUnicodeObject *substring,
5902 Py_ssize_t start,
5903 Py_ssize_t end,
5904 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (substring->length == 0)
5907 return 1;
5908
Thomas Wouters477c8d52006-05-27 19:21:47 +00005909 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
5911 end -= substring->length;
5912 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
5915 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 if (Py_UNICODE_MATCH(self, end, substring))
5917 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 } else {
5919 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 }
5922
5923 return 0;
5924}
5925
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyObject *substr,
5928 Py_ssize_t start,
5929 Py_ssize_t end,
5930 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005932 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005933
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 str = PyUnicode_FromObject(str);
5935 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 substr = PyUnicode_FromObject(substr);
5938 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 Py_DECREF(str);
5940 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
Tim Petersced69f82003-09-16 20:30:58 +00005942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 (PyUnicodeObject *)substr,
5945 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 Py_DECREF(str);
5947 Py_DECREF(substr);
5948 return result;
5949}
5950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951/* Apply fixfct filter to the Unicode object self and return a
5952 reference to the modified object */
5953
Tim Petersced69f82003-09-16 20:30:58 +00005954static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
5958
5959 PyUnicodeObject *u;
5960
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005961 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005964
5965 Py_UNICODE_COPY(u->str, self->str, self->length);
5966
Tim Peters7a29bd52001-09-12 03:03:31 +00005967 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 /* fixfct should return TRUE if it modified the buffer. If
5969 FALSE, return a reference to the original buffer instead
5970 (to save space, not time) */
5971 Py_INCREF(self);
5972 Py_DECREF(u);
5973 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
5975 return (PyObject*) u;
5976}
5977
Tim Petersced69f82003-09-16 20:30:58 +00005978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979int fixupper(PyUnicodeObject *self)
5980{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005981 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 Py_UNICODE *s = self->str;
5983 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005984
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 ch = Py_UNICODE_TOUPPER(*s);
5989 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 *s = ch;
5992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 s++;
5994 }
5995
5996 return status;
5997}
5998
Tim Petersced69f82003-09-16 20:30:58 +00005999static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000int fixlower(PyUnicodeObject *self)
6001{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006002 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 Py_UNICODE *s = self->str;
6004 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006005
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 ch = Py_UNICODE_TOLOWER(*s);
6010 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 *s = ch;
6013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 s++;
6015 }
6016
6017 return status;
6018}
6019
Tim Petersced69f82003-09-16 20:30:58 +00006020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021int fixswapcase(PyUnicodeObject *self)
6022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 Py_UNICODE *s = self->str;
6025 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006026
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 while (len-- > 0) {
6028 if (Py_UNICODE_ISUPPER(*s)) {
6029 *s = Py_UNICODE_TOLOWER(*s);
6030 status = 1;
6031 } else if (Py_UNICODE_ISLOWER(*s)) {
6032 *s = Py_UNICODE_TOUPPER(*s);
6033 status = 1;
6034 }
6035 s++;
6036 }
6037
6038 return status;
6039}
6040
Tim Petersced69f82003-09-16 20:30:58 +00006041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042int fixcapitalize(PyUnicodeObject *self)
6043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006045 Py_UNICODE *s = self->str;
6046 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006047
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006048 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006050 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 *s = Py_UNICODE_TOUPPER(*s);
6052 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006054 s++;
6055 while (--len > 0) {
6056 if (Py_UNICODE_ISUPPER(*s)) {
6057 *s = Py_UNICODE_TOLOWER(*s);
6058 status = 1;
6059 }
6060 s++;
6061 }
6062 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063}
6064
6065static
6066int fixtitle(PyUnicodeObject *self)
6067{
6068 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6069 register Py_UNICODE *e;
6070 int previous_is_cased;
6071
6072 /* Shortcut for single character strings */
6073 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6075 if (*p != ch) {
6076 *p = ch;
6077 return 1;
6078 }
6079 else
6080 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Tim Petersced69f82003-09-16 20:30:58 +00006082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 e = p + PyUnicode_GET_SIZE(self);
6084 previous_is_cased = 0;
6085 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 if (previous_is_cased)
6089 *p = Py_UNICODE_TOLOWER(ch);
6090 else
6091 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006092
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 if (Py_UNICODE_ISLOWER(ch) ||
6094 Py_UNICODE_ISUPPER(ch) ||
6095 Py_UNICODE_ISTITLE(ch))
6096 previous_is_cased = 1;
6097 else
6098 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
6100 return 1;
6101}
6102
Tim Peters8ce9f162004-08-27 01:49:32 +00006103PyObject *
6104PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Skip Montanaro6543b452004-09-16 03:28:13 +00006106 const Py_UNICODE blank = ' ';
6107 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006109 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006110 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6111 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006112 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6113 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006114 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006115 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Tim Peters05eba1f2004-08-27 21:32:02 +00006117 fseq = PySequence_Fast(seq, "");
6118 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006119 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006120 }
6121
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006122 /* NOTE: the following code can't call back into Python code,
6123 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006124 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006125
Tim Peters05eba1f2004-08-27 21:32:02 +00006126 seqlen = PySequence_Fast_GET_SIZE(fseq);
6127 /* If empty sequence, return u"". */
6128 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006129 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6130 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006131 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006132 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006133 /* If singleton sequence with an exact Unicode, return that. */
6134 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 item = items[0];
6136 if (PyUnicode_CheckExact(item)) {
6137 Py_INCREF(item);
6138 res = (PyUnicodeObject *)item;
6139 goto Done;
6140 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006141 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006142 else {
6143 /* Set up sep and seplen */
6144 if (separator == NULL) {
6145 sep = &blank;
6146 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006147 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006148 else {
6149 if (!PyUnicode_Check(separator)) {
6150 PyErr_Format(PyExc_TypeError,
6151 "separator: expected str instance,"
6152 " %.80s found",
6153 Py_TYPE(separator)->tp_name);
6154 goto onError;
6155 }
6156 sep = PyUnicode_AS_UNICODE(separator);
6157 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006158 }
6159 }
6160
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006161 /* There are at least two things to join, or else we have a subclass
6162 * of str in the sequence.
6163 * Do a pre-pass to figure out the total amount of space we'll
6164 * need (sz), and see whether all argument are strings.
6165 */
6166 sz = 0;
6167 for (i = 0; i < seqlen; i++) {
6168 const Py_ssize_t old_sz = sz;
6169 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 if (!PyUnicode_Check(item)) {
6171 PyErr_Format(PyExc_TypeError,
6172 "sequence item %zd: expected str instance,"
6173 " %.80s found",
6174 i, Py_TYPE(item)->tp_name);
6175 goto onError;
6176 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006177 sz += PyUnicode_GET_SIZE(item);
6178 if (i != 0)
6179 sz += seplen;
6180 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6181 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006183 goto onError;
6184 }
6185 }
Tim Petersced69f82003-09-16 20:30:58 +00006186
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006187 res = _PyUnicode_New(sz);
6188 if (res == NULL)
6189 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006190
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006191 /* Catenate everything. */
6192 res_p = PyUnicode_AS_UNICODE(res);
6193 for (i = 0; i < seqlen; ++i) {
6194 Py_ssize_t itemlen;
6195 item = items[i];
6196 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 /* Copy item, and maybe the separator. */
6198 if (i) {
6199 Py_UNICODE_COPY(res_p, sep, seplen);
6200 res_p += seplen;
6201 }
6202 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6203 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006204 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006205
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006207 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 return (PyObject *)res;
6209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006211 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006212 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return NULL;
6214}
6215
Tim Petersced69f82003-09-16 20:30:58 +00006216static
6217PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 Py_ssize_t left,
6219 Py_ssize_t right,
6220 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221{
6222 PyUnicodeObject *u;
6223
6224 if (left < 0)
6225 left = 0;
6226 if (right < 0)
6227 right = 0;
6228
Tim Peters7a29bd52001-09-12 03:03:31 +00006229 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 Py_INCREF(self);
6231 return self;
6232 }
6233
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006234 if (left > PY_SSIZE_T_MAX - self->length ||
6235 right > PY_SSIZE_T_MAX - (left + self->length)) {
6236 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6237 return NULL;
6238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 u = _PyUnicode_New(left + self->length + right);
6240 if (u) {
6241 if (left)
6242 Py_UNICODE_FILL(u->str, fill, left);
6243 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6244 if (right)
6245 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6246 }
6247
6248 return u;
6249}
6250
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro relies on names provided by the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when either the slice cannot be
   created or the append fails.  The temporary reference held in `str`
   is always released.

   Wrapped in do { ... } while (0) so that an invocation followed by a
   semicolon behaves as exactly one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
6262static
6263PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 PyObject *list,
6265 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006267 register Py_ssize_t i;
6268 register Py_ssize_t j;
6269 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006271 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272
6273 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006275 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006277 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6279 i++;
6280 if (j < i) {
6281 if (maxcount-- <= 0)
6282 break;
6283 SPLIT_APPEND(buf, j, i);
6284 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6285 i++;
6286 j = i;
6287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 }
6289 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 }
6292 return list;
6293
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 Py_DECREF(list);
6296 return NULL;
6297}
6298
6299PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 register Py_ssize_t i;
6303 register Py_ssize_t j;
6304 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 PyObject *list;
6306 PyObject *str;
6307 Py_UNICODE *data;
6308
6309 string = PyUnicode_FromObject(string);
6310 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 data = PyUnicode_AS_UNICODE(string);
6313 len = PyUnicode_GET_SIZE(string);
6314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 list = PyList_New(0);
6316 if (!list)
6317 goto onError;
6318
6319 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 /* Find a line and append it */
6323 while (i < len && !BLOOM_LINEBREAK(data[i]))
6324 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006327 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 if (i < len) {
6329 if (data[i] == '\r' && i + 1 < len &&
6330 data[i+1] == '\n')
6331 i += 2;
6332 else
6333 i++;
6334 if (keepends)
6335 eol = i;
6336 }
6337 SPLIT_APPEND(data, j, eol);
6338 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
6340 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 }
6343
6344 Py_DECREF(string);
6345 return list;
6346
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006348 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 Py_DECREF(string);
6350 return NULL;
6351}
6352
Tim Petersced69f82003-09-16 20:30:58 +00006353static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 PyObject *list,
6356 Py_UNICODE ch,
6357 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 register Py_ssize_t i;
6360 register Py_ssize_t j;
6361 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006363 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 if (buf[i] == ch) {
6367 if (maxcount-- <= 0)
6368 break;
6369 SPLIT_APPEND(buf, j, i);
6370 i = j = i + 1;
6371 } else
6372 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
6374 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 }
6377 return list;
6378
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 Py_DECREF(list);
6381 return NULL;
6382}
6383
Tim Petersced69f82003-09-16 20:30:58 +00006384static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 PyObject *list,
6387 PyUnicodeObject *substring,
6388 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 register Py_ssize_t i;
6391 register Py_ssize_t j;
6392 Py_ssize_t len = self->length;
6393 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 PyObject *str;
6395
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006396 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 if (Py_UNICODE_MATCH(self, i, substring)) {
6398 if (maxcount-- <= 0)
6399 break;
6400 SPLIT_APPEND(self->str, j, i);
6401 i = j = i + sublen;
6402 } else
6403 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 }
6405 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 }
6408 return list;
6409
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 Py_DECREF(list);
6412 return NULL;
6413}
6414
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006415static
6416PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 PyObject *list,
6418 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006420 register Py_ssize_t i;
6421 register Py_ssize_t j;
6422 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006423 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006424 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006425
6426 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006428 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006430 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6432 i--;
6433 if (j > i) {
6434 if (maxcount-- <= 0)
6435 break;
6436 SPLIT_APPEND(buf, i + 1, j + 1);
6437 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6438 i--;
6439 j = i;
6440 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006441 }
6442 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006444 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006445 if (PyList_Reverse(list) < 0)
6446 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006447 return list;
6448
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006450 Py_DECREF(list);
6451 return NULL;
6452}
6453
Benjamin Peterson14339b62009-01-31 16:36:08 +00006454static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006455PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 PyObject *list,
6457 Py_UNICODE ch,
6458 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006459{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460 register Py_ssize_t i;
6461 register Py_ssize_t j;
6462 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006463 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006464 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006465
6466 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 if (buf[i] == ch) {
6468 if (maxcount-- <= 0)
6469 break;
6470 SPLIT_APPEND(buf, i + 1, j + 1);
6471 j = i = i - 1;
6472 } else
6473 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006474 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006475 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 if (PyList_Reverse(list) < 0)
6479 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006480 return list;
6481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006483 Py_DECREF(list);
6484 return NULL;
6485}
6486
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006488PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 PyObject *list,
6490 PyUnicodeObject *substring,
6491 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 register Py_ssize_t i;
6494 register Py_ssize_t j;
6495 Py_ssize_t len = self->length;
6496 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006497 PyObject *str;
6498
6499 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (Py_UNICODE_MATCH(self, i, substring)) {
6501 if (maxcount-- <= 0)
6502 break;
6503 SPLIT_APPEND(self->str, i + sublen, j);
6504 j = i;
6505 i -= sublen;
6506 } else
6507 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006508 }
6509 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006511 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 if (PyList_Reverse(list) < 0)
6513 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006514 return list;
6515
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006517 Py_DECREF(list);
6518 return NULL;
6519}
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521#undef SPLIT_APPEND
6522
6523static
6524PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 PyUnicodeObject *substring,
6526 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
6528 PyObject *list;
6529
6530 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006531 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
6533 list = PyList_New(0);
6534 if (!list)
6535 return NULL;
6536
6537 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
6540 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542
6543 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 Py_DECREF(list);
6545 PyErr_SetString(PyExc_ValueError, "empty separator");
6546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 }
6548 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Tim Petersced69f82003-09-16 20:30:58 +00006552static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006553PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 PyUnicodeObject *substring,
6555 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006556{
6557 PyObject *list;
6558
6559 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006560 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006561
6562 list = PyList_New(0);
6563 if (!list)
6564 return NULL;
6565
6566 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006568
6569 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006571
6572 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 Py_DECREF(list);
6574 PyErr_SetString(PyExc_ValueError, "empty separator");
6575 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006576 }
6577 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006579}
6580
6581static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 PyUnicodeObject *str1,
6584 PyUnicodeObject *str2,
6585 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586{
6587 PyUnicodeObject *u;
6588
6589 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 if (str1->length == str2->length) {
6593 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006594 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 if (str1->length == 1) {
6596 /* replace characters */
6597 Py_UNICODE u1, u2;
6598 if (!findchar(self->str, self->length, str1->str[0]))
6599 goto nothing;
6600 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6601 if (!u)
6602 return NULL;
6603 Py_UNICODE_COPY(u->str, self->str, self->length);
6604 u1 = str1->str[0];
6605 u2 = str2->str[0];
6606 for (i = 0; i < u->length; i++)
6607 if (u->str[i] == u1) {
6608 if (--maxcount < 0)
6609 break;
6610 u->str[i] = u2;
6611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006613 i = fastsearch(
6614 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 if (i < 0)
6617 goto nothing;
6618 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6619 if (!u)
6620 return NULL;
6621 Py_UNICODE_COPY(u->str, self->str, self->length);
6622 while (i <= self->length - str1->length)
6623 if (Py_UNICODE_MATCH(self, i, str1)) {
6624 if (--maxcount < 0)
6625 break;
6626 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6627 i += str1->length;
6628 } else
6629 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006632
6633 Py_ssize_t n, i, j, e;
6634 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 Py_UNICODE *p;
6636
6637 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 if (n > maxcount)
6640 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006641 if (n == 0)
6642 goto nothing;
6643 /* new_size = self->length + n * (str2->length - str1->length)); */
6644 delta = (str2->length - str1->length);
6645 if (delta == 0) {
6646 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006648 product = n * (str2->length - str1->length);
6649 if ((product / (str2->length - str1->length)) != n) {
6650 PyErr_SetString(PyExc_OverflowError,
6651 "replace string is too long");
6652 return NULL;
6653 }
6654 new_size = self->length + product;
6655 if (new_size < 0) {
6656 PyErr_SetString(PyExc_OverflowError,
6657 "replace string is too long");
6658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 }
6660 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006661 u = _PyUnicode_New(new_size);
6662 if (!u)
6663 return NULL;
6664 i = 0;
6665 p = u->str;
6666 e = self->length - str1->length;
6667 if (str1->length > 0) {
6668 while (n-- > 0) {
6669 /* look for next match */
6670 j = i;
6671 while (j <= e) {
6672 if (Py_UNICODE_MATCH(self, j, str1))
6673 break;
6674 j++;
6675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006677 if (j > e)
6678 break;
6679 /* copy unchanged part [i:j] */
6680 Py_UNICODE_COPY(p, self->str+i, j-i);
6681 p += j - i;
6682 }
6683 /* copy substitution string */
6684 if (str2->length > 0) {
6685 Py_UNICODE_COPY(p, str2->str, str2->length);
6686 p += str2->length;
6687 }
6688 i = j + str1->length;
6689 }
6690 if (i < self->length)
6691 /* copy tail [i:] */
6692 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6693 } else {
6694 /* interleave */
6695 while (n > 0) {
6696 Py_UNICODE_COPY(p, str2->str, str2->length);
6697 p += str2->length;
6698 if (--n <= 0)
6699 break;
6700 *p++ = self->str[i++];
6701 }
6702 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006706
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006708 /* nothing to replace; return original string (when possible) */
6709 if (PyUnicode_CheckExact(self)) {
6710 Py_INCREF(self);
6711 return (PyObject *) self;
6712 }
6713 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
6716/* --- Unicode Object Methods --------------------------------------------- */
6717
/* Docstring exposed for the title() method. */
PyDoc_STRVAR(title__doc__,
             "S.title() -> str\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title().

   Thin wrapper: hands self to fixup() together with the fixtitle
   callback and returns fixup()'s result unchanged. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
6729
/* Docstring exposed for the capitalize() method. */
PyDoc_STRVAR(capitalize__doc__,
             "S.capitalize() -> str\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implementation of S.capitalize().

   Thin wrapper: hands self to fixup() together with the fixcapitalize
   callback and returns fixup()'s result unchanged. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
6741
#if 0
/* NOTE: this whole region is compiled out by the surrounding "#if 0". */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Implementation of S.capwords().

   Splits self into a list, replaces every list item with the result of
   fixup(item, fixcapitalize), and joins the list again.  Returns the
   joined object, or NULL if splitting or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the shared exit below returns NULL */
        if (item == NULL)
            goto onError;
        /* drop the old entry before overwriting its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit for success and failure: the list is always released */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6779
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006780/* Argument converter. Coerces to a single unicode character */
6781
6782static int
6783convert_uc(PyObject *obj, void *addr)
6784{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006785 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6786 PyObject *uniobj;
6787 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006788
Benjamin Peterson14339b62009-01-31 16:36:08 +00006789 uniobj = PyUnicode_FromObject(obj);
6790 if (uniobj == NULL) {
6791 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006793 return 0;
6794 }
6795 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6796 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006798 Py_DECREF(uniobj);
6799 return 0;
6800 }
6801 unistr = PyUnicode_AS_UNICODE(uniobj);
6802 *fillcharloc = unistr[0];
6803 Py_DECREF(uniobj);
6804 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006805}
6806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006810Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006811done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
6813static PyObject *
6814unicode_center(PyUnicodeObject *self, PyObject *args)
6815{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t marg, left;
6817 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006818 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819
Thomas Woutersde017742006-02-16 19:34:37 +00006820 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 return NULL;
6822
Tim Peters7a29bd52001-09-12 03:03:31 +00006823 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 Py_INCREF(self);
6825 return (PyObject*) self;
6826 }
6827
6828 marg = width - self->length;
6829 left = marg / 2 + (marg & width & 1);
6830
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006831 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832}
6833
Marc-André Lemburge5034372000-08-08 08:04:29 +00006834#if 0
6835
6836/* This code should go into some future Unicode collation support
6837 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006838 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006839
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006840/* speedy UTF-16 code point order comparison */
6841/* gleaned from: */
6842/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6843
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006844static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006845{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006846 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006847 0, 0, 0, 0, 0, 0, 0, 0,
6848 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006849 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006850};
6851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852static int
6853unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006855 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_UNICODE *s1 = str1->str;
6858 Py_UNICODE *s2 = str2->str;
6859
6860 len1 = str1->length;
6861 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006862
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006864 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006865
6866 c1 = *s1++;
6867 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006868
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 if (c1 > (1<<11) * 26)
6870 c1 += utf16Fixup[c1>>11];
6871 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006872 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006873 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006874
6875 if (c1 != c2)
6876 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006877
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006878 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 }
6880
6881 return (len1 < len2) ? -1 : (len1 != len2);
6882}
6883
Marc-André Lemburge5034372000-08-08 08:04:29 +00006884#else
6885
6886static int
6887unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006889 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006890
6891 Py_UNICODE *s1 = str1->str;
6892 Py_UNICODE *s2 = str2->str;
6893
6894 len1 = str1->length;
6895 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006896
Marc-André Lemburge5034372000-08-08 08:04:29 +00006897 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006898 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006899
Fredrik Lundh45714e92001-06-26 16:39:36 +00006900 c1 = *s1++;
6901 c2 = *s2++;
6902
6903 if (c1 != c2)
6904 return (c1 < c2) ? -1 : 1;
6905
Marc-André Lemburge5034372000-08-08 08:04:29 +00006906 len1--; len2--;
6907 }
6908
6909 return (len1 < len2) ? -1 : (len1 != len2);
6910}
6911
6912#endif
6913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006917 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6918 return unicode_compare((PyUnicodeObject *)left,
6919 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006920 PyErr_Format(PyExc_TypeError,
6921 "Can't compare %.100s and %.100s",
6922 left->ob_type->tp_name,
6923 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 return -1;
6925}
6926
Martin v. Löwis5b222132007-06-10 09:51:05 +00006927int
6928PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6929{
6930 int i;
6931 Py_UNICODE *id;
6932 assert(PyUnicode_Check(uni));
6933 id = PyUnicode_AS_UNICODE(uni);
6934 /* Compare Unicode string and source character set string */
6935 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 if (id[i] != str[i])
6937 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006938 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006940 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006942 return 0;
6943}
6944
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006945
Benjamin Peterson29060642009-01-31 22:14:21 +00006946#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006948
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006949PyObject *PyUnicode_RichCompare(PyObject *left,
6950 PyObject *right,
6951 int op)
6952{
6953 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006954
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006955 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6956 PyObject *v;
6957 if (((PyUnicodeObject *) left)->length !=
6958 ((PyUnicodeObject *) right)->length) {
6959 if (op == Py_EQ) {
6960 Py_INCREF(Py_False);
6961 return Py_False;
6962 }
6963 if (op == Py_NE) {
6964 Py_INCREF(Py_True);
6965 return Py_True;
6966 }
6967 }
6968 if (left == right)
6969 result = 0;
6970 else
6971 result = unicode_compare((PyUnicodeObject *)left,
6972 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006973
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006974 /* Convert the return value to a Boolean */
6975 switch (op) {
6976 case Py_EQ:
6977 v = TEST_COND(result == 0);
6978 break;
6979 case Py_NE:
6980 v = TEST_COND(result != 0);
6981 break;
6982 case Py_LE:
6983 v = TEST_COND(result <= 0);
6984 break;
6985 case Py_GE:
6986 v = TEST_COND(result >= 0);
6987 break;
6988 case Py_LT:
6989 v = TEST_COND(result == -1);
6990 break;
6991 case Py_GT:
6992 v = TEST_COND(result == 1);
6993 break;
6994 default:
6995 PyErr_BadArgument();
6996 return NULL;
6997 }
6998 Py_INCREF(v);
6999 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007001
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007002 Py_INCREF(Py_NotImplemented);
7003 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007004}
7005
Guido van Rossum403d68b2000-03-13 15:55:09 +00007006int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007008{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007010 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007011
7012 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 sub = PyUnicode_FromObject(element);
7014 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 PyErr_Format(PyExc_TypeError,
7016 "'in <string>' requires string as left operand, not %s",
7017 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007019 }
7020
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021 str = PyUnicode_FromObject(container);
7022 if (!str) {
7023 Py_DECREF(sub);
7024 return -1;
7025 }
7026
7027 result = stringlib_contains_obj(str, sub);
7028
7029 Py_DECREF(str);
7030 Py_DECREF(sub);
7031
Guido van Rossum403d68b2000-03-13 15:55:09 +00007032 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007033}
7034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035/* Concat to string or Unicode object giving a new Unicode object. */
7036
7037PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
7040 PyUnicodeObject *u = NULL, *v = NULL, *w;
7041
7042 /* Coerce the two arguments */
7043 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7044 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7047 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049
7050 /* Shortcuts */
7051 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 Py_DECREF(v);
7053 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 }
7055 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 Py_DECREF(u);
7057 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 }
7059
7060 /* Concat the two Unicode strings */
7061 w = _PyUnicode_New(u->length + v->length);
7062 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 Py_UNICODE_COPY(w->str, u->str, u->length);
7065 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7066
7067 Py_DECREF(u);
7068 Py_DECREF(v);
7069 return (PyObject *)w;
7070
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 Py_XDECREF(u);
7073 Py_XDECREF(v);
7074 return NULL;
7075}
7076
Walter Dörwald1ab83302007-05-18 17:15:44 +00007077void
7078PyUnicode_Append(PyObject **pleft, PyObject *right)
7079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007080 PyObject *new;
7081 if (*pleft == NULL)
7082 return;
7083 if (right == NULL || !PyUnicode_Check(*pleft)) {
7084 Py_DECREF(*pleft);
7085 *pleft = NULL;
7086 return;
7087 }
7088 new = PyUnicode_Concat(*pleft, right);
7089 Py_DECREF(*pleft);
7090 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007091}
7092
/* Same as PyUnicode_Append(pleft, right), but additionally releases the
   caller's reference to right afterwards.  right may be NULL (the
   release uses Py_XDECREF). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007103Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007104string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007105interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
7107static PyObject *
7108unicode_count(PyUnicodeObject *self, PyObject *args)
7109{
7110 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007112 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 PyObject *result;
7114
Guido van Rossumb8872e62000-05-09 14:14:27 +00007115 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 return NULL;
7118
7119 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007120 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007123
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125
Christian Heimes217cfd12007-12-02 14:31:20 +00007126 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007127 stringlib_count(self->str + start, end - start,
7128 substring->str, substring->length)
7129 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007132
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 return result;
7134}
7135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007136PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007139Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007140to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007141handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7143'xmlcharrefreplace' as well as any other name registered with\n\
7144codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject *
7147unicode_encode(PyUnicodeObject *self, PyObject *args)
7148{
7149 char *encoding = NULL;
7150 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007151 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007152
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7154 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007155 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007156 if (v == NULL)
7157 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007158 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007159 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007160 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007161 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007162 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007163 Py_DECREF(v);
7164 return NULL;
7165 }
7166 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007167
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007169 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007170}
7171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174\n\
7175Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
7178static PyObject*
7179unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7180{
7181 Py_UNICODE *e;
7182 Py_UNICODE *p;
7183 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007184 Py_UNICODE *qe;
7185 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 PyUnicodeObject *u;
7187 int tabsize = 8;
7188
7189 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
Thomas Wouters7e474022000-07-16 12:04:32 +00007192 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007193 i = 0; /* chars up to and including most recent \n or \r */
7194 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7195 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 for (p = self->str; p < e; p++)
7197 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 if (tabsize > 0) {
7199 incr = tabsize - (j % tabsize); /* cannot overflow */
7200 if (j > PY_SSIZE_T_MAX - incr)
7201 goto overflow1;
7202 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 if (j > PY_SSIZE_T_MAX - 1)
7207 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 j++;
7209 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 if (i > PY_SSIZE_T_MAX - j)
7211 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007213 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 }
7215 }
7216
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007217 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007219
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 /* Second pass: create output string and fill it */
7221 u = _PyUnicode_New(i + j);
7222 if (!u)
7223 return NULL;
7224
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007225 j = 0; /* same as in first pass */
7226 q = u->str; /* next output char */
7227 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228
7229 for (p = self->str; p < e; p++)
7230 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 if (tabsize > 0) {
7232 i = tabsize - (j % tabsize);
7233 j += i;
7234 while (i--) {
7235 if (q >= qe)
7236 goto overflow2;
7237 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007238 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 else {
7242 if (q >= qe)
7243 goto overflow2;
7244 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007245 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 if (*p == '\n' || *p == '\r')
7247 j = 0;
7248 }
7249
7250 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007251
7252 overflow2:
7253 Py_DECREF(u);
7254 overflow1:
7255 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257}
7258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007259PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261\n\
7262Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007263such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264arguments start and end are interpreted as in slice notation.\n\
7265\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007266Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267
7268static PyObject *
7269unicode_find(PyUnicodeObject *self, PyObject *args)
7270{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007271 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007272 Py_ssize_t start;
7273 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007274 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
Christian Heimes9cd17752007-11-18 19:35:23 +00007276 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278
Thomas Wouters477c8d52006-05-27 19:21:47 +00007279 result = stringlib_find_slice(
7280 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7281 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7282 start, end
7283 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007286
Christian Heimes217cfd12007-12-02 14:31:20 +00007287 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
7290static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292{
7293 if (index < 0 || index >= self->length) {
7294 PyErr_SetString(PyExc_IndexError, "string index out of range");
7295 return NULL;
7296 }
7297
7298 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7299}
7300
Guido van Rossumc2504932007-09-18 19:42:40 +00007301/* Believe it or not, this produces the same value for ASCII strings
7302 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007304unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305{
Guido van Rossumc2504932007-09-18 19:42:40 +00007306 Py_ssize_t len;
7307 Py_UNICODE *p;
7308 long x;
7309
7310 if (self->hash != -1)
7311 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007312 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007313 p = self->str;
7314 x = *p << 7;
7315 while (--len >= 0)
7316 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007317 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007318 if (x == -1)
7319 x = -2;
7320 self->hash = x;
7321 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322}
7323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007324PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007327Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328
7329static PyObject *
7330unicode_index(PyUnicodeObject *self, PyObject *args)
7331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007333 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007334 Py_ssize_t start;
7335 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336
Christian Heimes9cd17752007-11-18 19:35:23 +00007337 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
Thomas Wouters477c8d52006-05-27 19:21:47 +00007340 result = stringlib_find_slice(
7341 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7342 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7343 start, end
7344 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
7346 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 if (result < 0) {
7349 PyErr_SetString(PyExc_ValueError, "substring not found");
7350 return NULL;
7351 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007352
Christian Heimes217cfd12007-12-02 14:31:20 +00007353 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354}
7355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007356PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007359Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007360at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007363unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364{
7365 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7366 register const Py_UNICODE *e;
7367 int cased;
7368
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 /* Shortcut for single character strings */
7370 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007373 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007374 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 e = p + PyUnicode_GET_SIZE(self);
7378 cased = 0;
7379 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007381
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7383 return PyBool_FromLong(0);
7384 else if (!cased && Py_UNICODE_ISLOWER(ch))
7385 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007387 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
7389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007393Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007394at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
7396static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007397unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398{
7399 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7400 register const Py_UNICODE *e;
7401 int cased;
7402
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 /* Shortcut for single character strings */
7404 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007407 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007408 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 e = p + PyUnicode_GET_SIZE(self);
7412 cased = 0;
7413 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007415
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7417 return PyBool_FromLong(0);
7418 else if (!cased && Py_UNICODE_ISUPPER(ch))
7419 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007421 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007427Return True if S is a titlecased string and there is at least one\n\
7428character in S, i.e. upper- and titlecase characters may only\n\
7429follow uncased characters and lowercase characters only cased ones.\n\
7430Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431
7432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007433unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434{
7435 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7436 register const Py_UNICODE *e;
7437 int cased, previous_is_cased;
7438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 /* Shortcut for single character strings */
7440 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7442 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007444 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007445 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007447
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 e = p + PyUnicode_GET_SIZE(self);
7449 cased = 0;
7450 previous_is_cased = 0;
7451 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007453
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7455 if (previous_is_cased)
7456 return PyBool_FromLong(0);
7457 previous_is_cased = 1;
7458 cased = 1;
7459 }
7460 else if (Py_UNICODE_ISLOWER(ch)) {
7461 if (!previous_is_cased)
7462 return PyBool_FromLong(0);
7463 previous_is_cased = 1;
7464 cased = 1;
7465 }
7466 else
7467 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007469 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470}
7471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007475Return True if all characters in S are whitespace\n\
7476and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
7478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007479unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480{
7481 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7482 register const Py_UNICODE *e;
7483
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 /* Shortcut for single character strings */
7485 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 Py_UNICODE_ISSPACE(*p))
7487 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007489 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007490 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007492
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 e = p + PyUnicode_GET_SIZE(self);
7494 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 if (!Py_UNICODE_ISSPACE(*p))
7496 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499}
7500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007503\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007504Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007505and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007506
7507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007508unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007509{
7510 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7511 register const Py_UNICODE *e;
7512
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007513 /* Shortcut for single character strings */
7514 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 Py_UNICODE_ISALPHA(*p))
7516 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007517
7518 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007519 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007521
7522 e = p + PyUnicode_GET_SIZE(self);
7523 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 if (!Py_UNICODE_ISALPHA(*p))
7525 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007526 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007527 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007528}
7529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007530PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007532\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007533Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007535
7536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007537unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007538{
7539 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7540 register const Py_UNICODE *e;
7541
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007542 /* Shortcut for single character strings */
7543 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 Py_UNICODE_ISALNUM(*p))
7545 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007546
7547 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007548 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007550
7551 e = p + PyUnicode_GET_SIZE(self);
7552 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 if (!Py_UNICODE_ISALNUM(*p))
7554 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007555 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007556 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007562Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564
7565static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007566unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567{
7568 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7569 register const Py_UNICODE *e;
7570
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 /* Shortcut for single character strings */
7572 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 Py_UNICODE_ISDECIMAL(*p))
7574 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007576 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007577 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007579
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 e = p + PyUnicode_GET_SIZE(self);
7581 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 if (!Py_UNICODE_ISDECIMAL(*p))
7583 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007585 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586}
7587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007588PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007591Return True if all characters in S are digits\n\
7592and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007595unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596{
7597 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7598 register const Py_UNICODE *e;
7599
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 /* Shortcut for single character strings */
7601 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 Py_UNICODE_ISDIGIT(*p))
7603 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007605 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007606 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007608
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 e = p + PyUnicode_GET_SIZE(self);
7610 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 if (!Py_UNICODE_ISDIGIT(*p))
7612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615}
7616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007620Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007624unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625{
7626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7627 register const Py_UNICODE *e;
7628
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 /* Shortcut for single character strings */
7630 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 Py_UNICODE_ISNUMERIC(*p))
7632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007634 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007635 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007637
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 e = p + PyUnicode_GET_SIZE(self);
7639 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 if (!Py_UNICODE_ISNUMERIC(*p))
7641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Martin v. Löwis47383402007-08-15 07:32:56 +00007646int
7647PyUnicode_IsIdentifier(PyObject *self)
7648{
7649 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7650 register const Py_UNICODE *e;
7651
7652 /* Special case for empty strings */
7653 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007655
7656 /* PEP 3131 says that the first character must be in
7657 XID_Start and subsequent characters in XID_Continue,
7658 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007659 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007660 letters, digits, underscore). However, given the current
7661 definition of XID_Start and XID_Continue, it is sufficient
7662 to check just for these, except that _ must be allowed
7663 as starting an identifier. */
7664 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7665 return 0;
7666
7667 e = p + PyUnicode_GET_SIZE(self);
7668 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 if (!_PyUnicode_IsXidContinue(*p))
7670 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007671 }
7672 return 1;
7673}
7674
7675PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007677\n\
7678Return True if S is a valid identifier according\n\
7679to the language definition.");
7680
7681static PyObject*
7682unicode_isidentifier(PyObject *self)
7683{
7684 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7685}
7686
Georg Brandl559e5d72008-06-11 18:37:52 +00007687PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007689\n\
7690Return True if all characters in S are considered\n\
7691printable in repr() or S is empty, False otherwise.");
7692
7693static PyObject*
7694unicode_isprintable(PyObject *self)
7695{
7696 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7697 register const Py_UNICODE *e;
7698
7699 /* Shortcut for single character strings */
7700 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7701 Py_RETURN_TRUE;
7702 }
7703
7704 e = p + PyUnicode_GET_SIZE(self);
7705 for (; p < e; p++) {
7706 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7707 Py_RETURN_FALSE;
7708 }
7709 }
7710 Py_RETURN_TRUE;
7711}
7712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715\n\
7716Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
7719static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007720unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007722 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723}
7724
Martin v. Löwis18e16552006-02-15 17:27:45 +00007725static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726unicode_length(PyUnicodeObject *self)
7727{
7728 return self->length;
7729}
7730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007731PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007734Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007735done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737static PyObject *
7738unicode_ljust(PyUnicodeObject *self, PyObject *args)
7739{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007740 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007741 Py_UNICODE fillchar = ' ';
7742
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007743 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 return NULL;
7745
Tim Peters7a29bd52001-09-12 03:03:31 +00007746 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 Py_INCREF(self);
7748 return (PyObject*) self;
7749 }
7750
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007751 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752}
7753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007757Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758
7759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007760unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 return fixup(self, fixlower);
7763}
7764
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7773
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007774/* externally visible for str.strip(unicode) */
7775PyObject *
7776_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007778 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7779 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7780 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7781 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7782 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007783
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007785
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786 i = 0;
7787 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7789 i++;
7790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007792
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 j = len;
7794 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 do {
7796 j--;
7797 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7798 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007800
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 Py_INCREF(self);
7803 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 }
7805 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007807}
7808
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
7810static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007813 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7814 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007815
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816 i = 0;
7817 if (striptype != RIGHTSTRIP) {
7818 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7819 i++;
7820 }
7821 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 j = len;
7824 if (striptype != LEFTSTRIP) {
7825 do {
7826 j--;
7827 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7828 j++;
7829 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007830
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7832 Py_INCREF(self);
7833 return (PyObject*)self;
7834 }
7835 else
7836 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837}
7838
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839
7840static PyObject *
7841do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7842{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007843 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007844
Benjamin Peterson14339b62009-01-31 16:36:08 +00007845 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7846 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847
Benjamin Peterson14339b62009-01-31 16:36:08 +00007848 if (sep != NULL && sep != Py_None) {
7849 if (PyUnicode_Check(sep))
7850 return _PyUnicode_XStrip(self, striptype, sep);
7851 else {
7852 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 "%s arg must be None or str",
7854 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007855 return NULL;
7856 }
7857 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007858
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007860}
7861
7862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007863PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007865\n\
7866Return a copy of the string S with leading and trailing\n\
7867whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007868If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007869
7870static PyObject *
7871unicode_strip(PyUnicodeObject *self, PyObject *args)
7872{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007873 if (PyTuple_GET_SIZE(args) == 0)
7874 return do_strip(self, BOTHSTRIP); /* Common case */
7875 else
7876 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007877}
7878
7879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007880PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007882\n\
7883Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007884If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007885
7886static PyObject *
7887unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 if (PyTuple_GET_SIZE(args) == 0)
7890 return do_strip(self, LEFTSTRIP); /* Common case */
7891 else
7892 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007893}
7894
7895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007896PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007898\n\
7899Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007900If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007901
7902static PyObject *
7903unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7904{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007905 if (PyTuple_GET_SIZE(args) == 0)
7906 return do_strip(self, RIGHTSTRIP); /* Common case */
7907 else
7908 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007909}
7910
7911
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914{
7915 PyUnicodeObject *u;
7916 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007917 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007918 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919
Georg Brandl222de0f2009-04-12 12:01:50 +00007920 if (len < 1) {
7921 Py_INCREF(unicode_empty);
7922 return (PyObject *)unicode_empty;
7923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924
Tim Peters7a29bd52001-09-12 03:03:31 +00007925 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 /* no repeat, return original string */
7927 Py_INCREF(str);
7928 return (PyObject*) str;
7929 }
Tim Peters8f422462000-09-09 06:13:41 +00007930
7931 /* ensure # of chars needed doesn't overflow int and # of bytes
7932 * needed doesn't overflow size_t
7933 */
7934 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007935 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007936 PyErr_SetString(PyExc_OverflowError,
7937 "repeated string is too long");
7938 return NULL;
7939 }
7940 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7941 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7942 PyErr_SetString(PyExc_OverflowError,
7943 "repeated string is too long");
7944 return NULL;
7945 }
7946 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 if (!u)
7948 return NULL;
7949
7950 p = u->str;
7951
Georg Brandl222de0f2009-04-12 12:01:50 +00007952 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007953 Py_UNICODE_FILL(p, str->str[0], len);
7954 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007955 Py_ssize_t done = str->length; /* number of characters copied this far */
7956 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007958 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007959 Py_UNICODE_COPY(p+done, p, n);
7960 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 }
7963
7964 return (PyObject*) u;
7965}
7966
7967PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 PyObject *subobj,
7969 PyObject *replobj,
7970 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
7972 PyObject *self;
7973 PyObject *str1;
7974 PyObject *str2;
7975 PyObject *result;
7976
7977 self = PyUnicode_FromObject(obj);
7978 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 str1 = PyUnicode_FromObject(subobj);
7981 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_DECREF(self);
7983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 }
7985 str2 = PyUnicode_FromObject(replobj);
7986 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 Py_DECREF(self);
7988 Py_DECREF(str1);
7989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 }
Tim Petersced69f82003-09-16 20:30:58 +00007991 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 (PyUnicodeObject *)str1,
7993 (PyUnicodeObject *)str2,
7994 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 Py_DECREF(self);
7996 Py_DECREF(str1);
7997 Py_DECREF(str2);
7998 return result;
7999}
8000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008001PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003\n\
8004Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008005old replaced by new. If the optional argument count is\n\
8006given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007
8008static PyObject*
8009unicode_replace(PyUnicodeObject *self, PyObject *args)
8010{
8011 PyUnicodeObject *str1;
8012 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 PyObject *result;
8015
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 return NULL;
8018 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8019 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008022 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 Py_DECREF(str1);
8024 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
8027 result = replace(self, str1, str2, maxcount);
8028
8029 Py_DECREF(str1);
8030 Py_DECREF(str2);
8031 return result;
8032}
8033
8034static
8035PyObject *unicode_repr(PyObject *unicode)
8036{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008037 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008038 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008039 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8040 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8041
8042 /* XXX(nnorwitz): rather than over-allocating, it would be
8043 better to choose a different scheme. Perhaps scan the
8044 first N-chars of the string and allocate based on that size.
8045 */
8046 /* Initial allocation is based on the longest-possible unichr
8047 escape.
8048
8049 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8050 unichr, so in this case it's the longest unichr escape. In
8051 narrow (UTF-16) builds this is five chars per source unichr
8052 since there are two unichrs in the surrogate pair, so in narrow
8053 (UTF-16) builds it's not the longest unichr escape.
8054
8055 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8056 so in the narrow (UTF-16) build case it's the longest unichr
8057 escape.
8058 */
8059
Walter Dörwald1ab83302007-05-18 17:15:44 +00008060 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008062#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008064#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008066#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008068 if (repr == NULL)
8069 return NULL;
8070
Walter Dörwald1ab83302007-05-18 17:15:44 +00008071 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008072
8073 /* Add quote */
8074 *p++ = (findchar(s, size, '\'') &&
8075 !findchar(s, size, '"')) ? '"' : '\'';
8076 while (size-- > 0) {
8077 Py_UNICODE ch = *s++;
8078
8079 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008080 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008081 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008082 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008083 continue;
8084 }
8085
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008087 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008088 *p++ = '\\';
8089 *p++ = 't';
8090 }
8091 else if (ch == '\n') {
8092 *p++ = '\\';
8093 *p++ = 'n';
8094 }
8095 else if (ch == '\r') {
8096 *p++ = '\\';
8097 *p++ = 'r';
8098 }
8099
8100 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008101 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008102 *p++ = '\\';
8103 *p++ = 'x';
8104 *p++ = hexdigits[(ch >> 4) & 0x000F];
8105 *p++ = hexdigits[ch & 0x000F];
8106 }
8107
Georg Brandl559e5d72008-06-11 18:37:52 +00008108 /* Copy ASCII characters as-is */
8109 else if (ch < 0x7F) {
8110 *p++ = ch;
8111 }
8112
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008114 else {
8115 Py_UCS4 ucs = ch;
8116
8117#ifndef Py_UNICODE_WIDE
8118 Py_UNICODE ch2 = 0;
8119 /* Get code point from surrogate pair */
8120 if (size > 0) {
8121 ch2 = *s;
8122 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008124 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008127 size--;
8128 }
8129 }
8130#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008132 (categories Z* and C* except ASCII space)
8133 */
8134 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8135 /* Map 8-bit characters to '\xhh' */
8136 if (ucs <= 0xff) {
8137 *p++ = '\\';
8138 *p++ = 'x';
8139 *p++ = hexdigits[(ch >> 4) & 0x000F];
8140 *p++ = hexdigits[ch & 0x000F];
8141 }
8142 /* Map 21-bit characters to '\U00xxxxxx' */
8143 else if (ucs >= 0x10000) {
8144 *p++ = '\\';
8145 *p++ = 'U';
8146 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8147 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8148 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8149 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8150 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8151 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8152 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8153 *p++ = hexdigits[ucs & 0x0000000F];
8154 }
8155 /* Map 16-bit characters to '\uxxxx' */
8156 else {
8157 *p++ = '\\';
8158 *p++ = 'u';
8159 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8160 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8161 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8162 *p++ = hexdigits[ucs & 0x000F];
8163 }
8164 }
8165 /* Copy characters as-is */
8166 else {
8167 *p++ = ch;
8168#ifndef Py_UNICODE_WIDE
8169 if (ucs >= 0x10000)
8170 *p++ = ch2;
8171#endif
8172 }
8173 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008174 }
8175 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008176 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008177
8178 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008179 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008180 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181}
8182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008183PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185\n\
8186Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008187such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188arguments start and end are interpreted as in slice notation.\n\
8189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008190Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
8192static PyObject *
8193unicode_rfind(PyUnicodeObject *self, PyObject *args)
8194{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008196 Py_ssize_t start;
8197 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199
Christian Heimes9cd17752007-11-18 19:35:23 +00008200 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202
Thomas Wouters477c8d52006-05-27 19:21:47 +00008203 result = stringlib_rfind_slice(
8204 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8205 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8206 start, end
8207 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208
8209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210
Christian Heimes217cfd12007-12-02 14:31:20 +00008211 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212}
8213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008214PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008217Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218
8219static PyObject *
8220unicode_rindex(PyUnicodeObject *self, PyObject *args)
8221{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008222 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008223 Py_ssize_t start;
8224 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008225 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226
Christian Heimes9cd17752007-11-18 19:35:23 +00008227 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229
Thomas Wouters477c8d52006-05-27 19:21:47 +00008230 result = stringlib_rfind_slice(
8231 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8232 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8233 start, end
8234 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235
8236 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008237
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 if (result < 0) {
8239 PyErr_SetString(PyExc_ValueError, "substring not found");
8240 return NULL;
8241 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008242 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243}
8244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008245PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008248Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008249done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250
8251static PyObject *
8252unicode_rjust(PyUnicodeObject *self, PyObject *args)
8253{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008254 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008255 Py_UNICODE fillchar = ' ';
8256
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008257 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 return NULL;
8259
Tim Peters7a29bd52001-09-12 03:03:31 +00008260 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 Py_INCREF(self);
8262 return (PyObject*) self;
8263 }
8264
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008265 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266}
8267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyObject *sep,
8270 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
8272 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008273
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 s = PyUnicode_FromObject(s);
8275 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 if (sep != NULL) {
8278 sep = PyUnicode_FromObject(sep);
8279 if (sep == NULL) {
8280 Py_DECREF(s);
8281 return NULL;
8282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
8284
8285 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8286
8287 Py_DECREF(s);
8288 Py_XDECREF(sep);
8289 return result;
8290}
8291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008292PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294\n\
8295Return a list of the words in S, using sep as the\n\
8296delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008297splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008298whitespace string is a separator and empty strings are\n\
8299removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
8301static PyObject*
8302unicode_split(PyUnicodeObject *self, PyObject *args)
8303{
8304 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008305 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 return NULL;
8309
8310 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
Thomas Wouters477c8d52006-05-27 19:21:47 +00008318PyObject *
8319PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8320{
8321 PyObject* str_obj;
8322 PyObject* sep_obj;
8323 PyObject* out;
8324
8325 str_obj = PyUnicode_FromObject(str_in);
8326 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008328 sep_obj = PyUnicode_FromObject(sep_in);
8329 if (!sep_obj) {
8330 Py_DECREF(str_obj);
8331 return NULL;
8332 }
8333
8334 out = stringlib_partition(
8335 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8336 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8337 );
8338
8339 Py_DECREF(sep_obj);
8340 Py_DECREF(str_obj);
8341
8342 return out;
8343}
8344
8345
8346PyObject *
8347PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8348{
8349 PyObject* str_obj;
8350 PyObject* sep_obj;
8351 PyObject* out;
8352
8353 str_obj = PyUnicode_FromObject(str_in);
8354 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008356 sep_obj = PyUnicode_FromObject(sep_in);
8357 if (!sep_obj) {
8358 Py_DECREF(str_obj);
8359 return NULL;
8360 }
8361
8362 out = stringlib_rpartition(
8363 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8364 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8365 );
8366
8367 Py_DECREF(sep_obj);
8368 Py_DECREF(str_obj);
8369
8370 return out;
8371}
8372
8373PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008375\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008376Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008377the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008378found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379
8380static PyObject*
8381unicode_partition(PyUnicodeObject *self, PyObject *separator)
8382{
8383 return PyUnicode_Partition((PyObject *)self, separator);
8384}
8385
8386PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008388\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008389Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008390the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008391separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392
8393static PyObject*
8394unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8395{
8396 return PyUnicode_RPartition((PyObject *)self, separator);
8397}
8398
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008399PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 PyObject *sep,
8401 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008402{
8403 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008405 s = PyUnicode_FromObject(s);
8406 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 if (sep != NULL) {
8409 sep = PyUnicode_FromObject(sep);
8410 if (sep == NULL) {
8411 Py_DECREF(s);
8412 return NULL;
8413 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008414 }
8415
8416 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8417
8418 Py_DECREF(s);
8419 Py_XDECREF(sep);
8420 return result;
8421}
8422
8423PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008425\n\
8426Return a list of the words in S, using sep as the\n\
8427delimiter string, starting at the end of the string and\n\
8428working to the front. If maxsplit is given, at most maxsplit\n\
8429splits are done. If sep is not specified, any whitespace string\n\
8430is a separator.");
8431
8432static PyObject*
8433unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8434{
8435 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008437
Martin v. Löwis18e16552006-02-15 17:27:45 +00008438 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008439 return NULL;
8440
8441 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008443 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008445 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008447}
8448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008449PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451\n\
8452Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008453Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008454is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
8456static PyObject*
8457unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8458{
Guido van Rossum86662912000-04-11 15:38:46 +00008459 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460
Guido van Rossum86662912000-04-11 15:38:46 +00008461 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 return NULL;
8463
Guido van Rossum86662912000-04-11 15:38:46 +00008464 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465}
8466
8467static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008468PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Walter Dörwald346737f2007-05-31 10:44:43 +00008470 if (PyUnicode_CheckExact(self)) {
8471 Py_INCREF(self);
8472 return self;
8473 } else
8474 /* Subtype -- return genuine unicode string with the same value. */
8475 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8476 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477}
8478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008479PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481\n\
8482Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
8485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008486unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 return fixup(self, fixswapcase);
8489}
8490
Georg Brandlceee0772007-11-27 23:48:05 +00008491PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008493\n\
8494Return a translation table usable for str.translate().\n\
8495If there is only one argument, it must be a dictionary mapping Unicode\n\
8496ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008497Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008498If there are two arguments, they must be strings of equal length, and\n\
8499in the resulting dictionary, each character in x will be mapped to the\n\
8500character at the same position in y. If there is a third argument, it\n\
8501must be a string, whose characters will be mapped to None in the result.");
8502
8503static PyObject*
8504unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8505{
8506 PyObject *x, *y = NULL, *z = NULL;
8507 PyObject *new = NULL, *key, *value;
8508 Py_ssize_t i = 0;
8509 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510
Georg Brandlceee0772007-11-27 23:48:05 +00008511 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8512 return NULL;
8513 new = PyDict_New();
8514 if (!new)
8515 return NULL;
8516 if (y != NULL) {
8517 /* x must be a string too, of equal length */
8518 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8519 if (!PyUnicode_Check(x)) {
8520 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8521 "be a string if there is a second argument");
8522 goto err;
8523 }
8524 if (PyUnicode_GET_SIZE(x) != ylen) {
8525 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8526 "arguments must have equal length");
8527 goto err;
8528 }
8529 /* create entries for translating chars in x to those in y */
8530 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008531 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8532 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008533 if (!key || !value)
8534 goto err;
8535 res = PyDict_SetItem(new, key, value);
8536 Py_DECREF(key);
8537 Py_DECREF(value);
8538 if (res < 0)
8539 goto err;
8540 }
8541 /* create entries for deleting chars in z */
8542 if (z != NULL) {
8543 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008544 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008545 if (!key)
8546 goto err;
8547 res = PyDict_SetItem(new, key, Py_None);
8548 Py_DECREF(key);
8549 if (res < 0)
8550 goto err;
8551 }
8552 }
8553 } else {
8554 /* x must be a dict */
8555 if (!PyDict_Check(x)) {
8556 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8557 "to maketrans it must be a dict");
8558 goto err;
8559 }
8560 /* copy entries into the new dict, converting string keys to int keys */
8561 while (PyDict_Next(x, &i, &key, &value)) {
8562 if (PyUnicode_Check(key)) {
8563 /* convert string keys to integer keys */
8564 PyObject *newkey;
8565 if (PyUnicode_GET_SIZE(key) != 1) {
8566 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8567 "table must be of length 1");
8568 goto err;
8569 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008570 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008571 if (!newkey)
8572 goto err;
8573 res = PyDict_SetItem(new, newkey, value);
8574 Py_DECREF(newkey);
8575 if (res < 0)
8576 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008577 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008578 /* just keep integer keys */
8579 if (PyDict_SetItem(new, key, value) < 0)
8580 goto err;
8581 } else {
8582 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8583 "be strings or integers");
8584 goto err;
8585 }
8586 }
8587 }
8588 return new;
8589 err:
8590 Py_DECREF(new);
8591 return NULL;
8592}
8593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008594PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596\n\
8597Return a copy of the string S, where all characters have been mapped\n\
8598through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008599Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008600Unmapped characters are left untouched. Characters mapped to None\n\
8601are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602
8603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008604unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
Georg Brandlceee0772007-11-27 23:48:05 +00008606 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607}
8608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008609PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008612Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
8614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008615unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 return fixup(self, fixupper);
8618}
8619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008620PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008623Pad a numeric string S with zeros on the left, to fill a field\n\
8624of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
8626static PyObject *
8627unicode_zfill(PyUnicodeObject *self, PyObject *args)
8628{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008629 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 PyUnicodeObject *u;
8631
Martin v. Löwis18e16552006-02-15 17:27:45 +00008632 Py_ssize_t width;
8633 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 return NULL;
8635
8636 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008637 if (PyUnicode_CheckExact(self)) {
8638 Py_INCREF(self);
8639 return (PyObject*) self;
8640 }
8641 else
8642 return PyUnicode_FromUnicode(
8643 PyUnicode_AS_UNICODE(self),
8644 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
8647
8648 fill = width - self->length;
8649
8650 u = pad(self, fill, 0, '0');
8651
Walter Dörwald068325e2002-04-15 13:36:47 +00008652 if (u == NULL)
8653 return NULL;
8654
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 if (u->str[fill] == '+' || u->str[fill] == '-') {
8656 /* move sign to beginning of string */
8657 u->str[0] = u->str[fill];
8658 u->str[fill] = '0';
8659 }
8660
8661 return (PyObject*) u;
8662}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663
#if 0
/* Compiled out.  Would return the file-level numfree counter as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008672PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008675Return True if S starts with the specified prefix, False otherwise.\n\
8676With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008677With optional end, stop comparing S at that position.\n\
8678prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
8680static PyObject *
8681unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008684 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008686 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008687 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008688 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008690 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8692 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008693 if (PyTuple_Check(subobj)) {
8694 Py_ssize_t i;
8695 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8696 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008698 if (substring == NULL)
8699 return NULL;
8700 result = tailmatch(self, substring, start, end, -1);
8701 Py_DECREF(substring);
8702 if (result) {
8703 Py_RETURN_TRUE;
8704 }
8705 }
8706 /* nothing matched */
8707 Py_RETURN_FALSE;
8708 }
8709 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008712 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008714 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715}
8716
8717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008718PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008721Return True if S ends with the specified suffix, False otherwise.\n\
8722With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008723With optional end, stop comparing S at that position.\n\
8724suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725
8726static PyObject *
8727unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008732 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008733 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008734 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008736 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8738 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008739 if (PyTuple_Check(subobj)) {
8740 Py_ssize_t i;
8741 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8742 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008744 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008746 result = tailmatch(self, substring, start, end, +1);
8747 Py_DECREF(substring);
8748 if (result) {
8749 Py_RETURN_TRUE;
8750 }
8751 }
8752 Py_RETURN_FALSE;
8753 }
8754 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008758 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008760 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761}
8762
Eric Smith8c663262007-08-25 02:26:07 +00008763#include "stringlib/string_format.h"
8764
8765PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008767\n\
8768");
8769
Eric Smith4a7d76d2008-05-30 18:10:19 +00008770static PyObject *
8771unicode__format__(PyObject* self, PyObject* args)
8772{
8773 PyObject *format_spec;
8774
8775 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8776 return NULL;
8777
8778 return _PyUnicode_FormatAdvanced(self,
8779 PyUnicode_AS_UNICODE(format_spec),
8780 PyUnicode_GET_SIZE(format_spec));
8781}
8782
Eric Smith8c663262007-08-25 02:26:07 +00008783PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008785\n\
8786");
8787
8788static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008789unicode__sizeof__(PyUnicodeObject *v)
8790{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008791 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8792 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008793}
8794
8795PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008797
8798static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008799unicode_getnewargs(PyUnicodeObject *v)
8800{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008801 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008802}
8803
8804
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805static PyMethodDef unicode_methods[] = {
8806
8807 /* Order is according to common usage: often used methods should
8808 appear first, since lookup is done sequentially. */
8809
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008810 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8811 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8812 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008813 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008814 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8815 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8816 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8817 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8818 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8819 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8820 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008821 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008822 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8823 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8824 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008825 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008826 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8827 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8828 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008829 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008830 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008831 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008832 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008833 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8834 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8835 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8836 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8837 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8838 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8839 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8840 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8841 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8842 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8843 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8844 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8845 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8846 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008847 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008848 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008849 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008850 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008851 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008852 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8853 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008854 {"maketrans", (PyCFunction) unicode_maketrans,
8855 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008856 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008857#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008858 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859#endif
8860
8861#if 0
8862 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008863 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864#endif
8865
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 {NULL, NULL}
8868};
8869
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008870static PyObject *
8871unicode_mod(PyObject *v, PyObject *w)
8872{
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 if (!PyUnicode_Check(v)) {
8874 Py_INCREF(Py_NotImplemented);
8875 return Py_NotImplemented;
8876 }
8877 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008878}
8879
8880static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008881 0, /*nb_add*/
8882 0, /*nb_subtract*/
8883 0, /*nb_multiply*/
8884 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008885};
8886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 (lenfunc) unicode_length, /* sq_length */
8889 PyUnicode_Concat, /* sq_concat */
8890 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8891 (ssizeargfunc) unicode_getitem, /* sq_item */
8892 0, /* sq_slice */
8893 0, /* sq_ass_item */
8894 0, /* sq_ass_slice */
8895 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896};
8897
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008898static PyObject*
8899unicode_subscript(PyUnicodeObject* self, PyObject* item)
8900{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008901 if (PyIndex_Check(item)) {
8902 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008903 if (i == -1 && PyErr_Occurred())
8904 return NULL;
8905 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008906 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008907 return unicode_getitem(self, i);
8908 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008909 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008910 Py_UNICODE* source_buf;
8911 Py_UNICODE* result_buf;
8912 PyObject* result;
8913
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008914 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008916 return NULL;
8917 }
8918
8919 if (slicelength <= 0) {
8920 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008921 } else if (start == 0 && step == 1 && slicelength == self->length &&
8922 PyUnicode_CheckExact(self)) {
8923 Py_INCREF(self);
8924 return (PyObject *)self;
8925 } else if (step == 1) {
8926 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008927 } else {
8928 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008929 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8930 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008931
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 if (result_buf == NULL)
8933 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008934
8935 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8936 result_buf[i] = source_buf[cur];
8937 }
Tim Petersced69f82003-09-16 20:30:58 +00008938
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008939 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008940 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008941 return result;
8942 }
8943 } else {
8944 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8945 return NULL;
8946 }
8947}
8948
8949static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008950 (lenfunc)unicode_length, /* mp_length */
8951 (binaryfunc)unicode_subscript, /* mp_subscript */
8952 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008953};
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956/* Helpers for PyUnicode_Format() */
8957
8958static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008959getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008961 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 (*p_argidx)++;
8964 if (arglen < 0)
8965 return args;
8966 else
8967 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 }
8969 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 return NULL;
8972}
8973
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008974/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008976static PyObject *
8977formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008979 char *p;
8980 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008982
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 x = PyFloat_AsDouble(v);
8984 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008985 return NULL;
8986
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008989
Eric Smith0923d1d2009-04-16 20:16:10 +00008990 p = PyOS_double_to_string(x, type, prec,
8991 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008992 if (p == NULL)
8993 return NULL;
8994 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008995 PyMem_Free(p);
8996 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997}
8998
Tim Peters38fd5b62000-09-21 05:43:11 +00008999static PyObject*
9000formatlong(PyObject *val, int flags, int prec, int type)
9001{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009002 char *buf;
9003 int len;
9004 PyObject *str; /* temporary string object. */
9005 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009006
Benjamin Peterson14339b62009-01-31 16:36:08 +00009007 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9008 if (!str)
9009 return NULL;
9010 result = PyUnicode_FromStringAndSize(buf, len);
9011 Py_DECREF(str);
9012 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009013}
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015static int
9016formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009017 size_t buflen,
9018 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009020 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009021 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 if (PyUnicode_GET_SIZE(v) == 1) {
9023 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9024 buf[1] = '\0';
9025 return 1;
9026 }
9027#ifndef Py_UNICODE_WIDE
9028 if (PyUnicode_GET_SIZE(v) == 2) {
9029 /* Decode a valid surrogate pair */
9030 int c0 = PyUnicode_AS_UNICODE(v)[0];
9031 int c1 = PyUnicode_AS_UNICODE(v)[1];
9032 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9033 0xDC00 <= c1 && c1 <= 0xDFFF) {
9034 buf[0] = c0;
9035 buf[1] = c1;
9036 buf[2] = '\0';
9037 return 2;
9038 }
9039 }
9040#endif
9041 goto onError;
9042 }
9043 else {
9044 /* Integer input truncated to a character */
9045 long x;
9046 x = PyLong_AsLong(v);
9047 if (x == -1 && PyErr_Occurred())
9048 goto onError;
9049
9050 if (x < 0 || x > 0x10ffff) {
9051 PyErr_SetString(PyExc_OverflowError,
9052 "%c arg not in range(0x110000)");
9053 return -1;
9054 }
9055
9056#ifndef Py_UNICODE_WIDE
9057 if (x > 0xffff) {
9058 x -= 0x10000;
9059 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9060 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9061 return 2;
9062 }
9063#endif
9064 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009065 buf[1] = '\0';
9066 return 1;
9067 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009068
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009070 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009072 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073}
9074
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009075/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009076 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009077*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009078#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009079
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082{
9083 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009084 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 int args_owned = 0;
9086 PyUnicodeObject *result = NULL;
9087 PyObject *dict = NULL;
9088 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009089
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 PyErr_BadInternalCall();
9092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 }
9094 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009095 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 fmt = PyUnicode_AS_UNICODE(uformat);
9098 fmtcnt = PyUnicode_GET_SIZE(uformat);
9099
9100 reslen = rescnt = fmtcnt + 100;
9101 result = _PyUnicode_New(reslen);
9102 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 res = PyUnicode_AS_UNICODE(result);
9105
9106 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 arglen = PyTuple_Size(args);
9108 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 }
9110 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 arglen = -1;
9112 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009114 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009115 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117
9118 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 if (*fmt != '%') {
9120 if (--rescnt < 0) {
9121 rescnt = fmtcnt + 100;
9122 reslen += rescnt;
9123 if (_PyUnicode_Resize(&result, reslen) < 0)
9124 goto onError;
9125 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9126 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 }
9130 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 /* Got a format specifier */
9132 int flags = 0;
9133 Py_ssize_t width = -1;
9134 int prec = -1;
9135 Py_UNICODE c = '\0';
9136 Py_UNICODE fill;
9137 int isnumok;
9138 PyObject *v = NULL;
9139 PyObject *temp = NULL;
9140 Py_UNICODE *pbuf;
9141 Py_UNICODE sign;
9142 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009143 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 fmt++;
9146 if (*fmt == '(') {
9147 Py_UNICODE *keystart;
9148 Py_ssize_t keylen;
9149 PyObject *key;
9150 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009151
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 if (dict == NULL) {
9153 PyErr_SetString(PyExc_TypeError,
9154 "format requires a mapping");
9155 goto onError;
9156 }
9157 ++fmt;
9158 --fmtcnt;
9159 keystart = fmt;
9160 /* Skip over balanced parentheses */
9161 while (pcount > 0 && --fmtcnt >= 0) {
9162 if (*fmt == ')')
9163 --pcount;
9164 else if (*fmt == '(')
9165 ++pcount;
9166 fmt++;
9167 }
9168 keylen = fmt - keystart - 1;
9169 if (fmtcnt < 0 || pcount > 0) {
9170 PyErr_SetString(PyExc_ValueError,
9171 "incomplete format key");
9172 goto onError;
9173 }
9174#if 0
9175 /* keys are converted to strings using UTF-8 and
9176 then looked up since Python uses strings to hold
9177 variables names etc. in its namespaces and we
9178 wouldn't want to break common idioms. */
9179 key = PyUnicode_EncodeUTF8(keystart,
9180 keylen,
9181 NULL);
9182#else
9183 key = PyUnicode_FromUnicode(keystart, keylen);
9184#endif
9185 if (key == NULL)
9186 goto onError;
9187 if (args_owned) {
9188 Py_DECREF(args);
9189 args_owned = 0;
9190 }
9191 args = PyObject_GetItem(dict, key);
9192 Py_DECREF(key);
9193 if (args == NULL) {
9194 goto onError;
9195 }
9196 args_owned = 1;
9197 arglen = -1;
9198 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 while (--fmtcnt >= 0) {
9201 switch (c = *fmt++) {
9202 case '-': flags |= F_LJUST; continue;
9203 case '+': flags |= F_SIGN; continue;
9204 case ' ': flags |= F_BLANK; continue;
9205 case '#': flags |= F_ALT; continue;
9206 case '0': flags |= F_ZERO; continue;
9207 }
9208 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 if (c == '*') {
9211 v = getnextarg(args, arglen, &argidx);
9212 if (v == NULL)
9213 goto onError;
9214 if (!PyLong_Check(v)) {
9215 PyErr_SetString(PyExc_TypeError,
9216 "* wants int");
9217 goto onError;
9218 }
9219 width = PyLong_AsLong(v);
9220 if (width == -1 && PyErr_Occurred())
9221 goto onError;
9222 if (width < 0) {
9223 flags |= F_LJUST;
9224 width = -width;
9225 }
9226 if (--fmtcnt >= 0)
9227 c = *fmt++;
9228 }
9229 else if (c >= '0' && c <= '9') {
9230 width = c - '0';
9231 while (--fmtcnt >= 0) {
9232 c = *fmt++;
9233 if (c < '0' || c > '9')
9234 break;
9235 if ((width*10) / 10 != width) {
9236 PyErr_SetString(PyExc_ValueError,
9237 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009238 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 }
9240 width = width*10 + (c - '0');
9241 }
9242 }
9243 if (c == '.') {
9244 prec = 0;
9245 if (--fmtcnt >= 0)
9246 c = *fmt++;
9247 if (c == '*') {
9248 v = getnextarg(args, arglen, &argidx);
9249 if (v == NULL)
9250 goto onError;
9251 if (!PyLong_Check(v)) {
9252 PyErr_SetString(PyExc_TypeError,
9253 "* wants int");
9254 goto onError;
9255 }
9256 prec = PyLong_AsLong(v);
9257 if (prec == -1 && PyErr_Occurred())
9258 goto onError;
9259 if (prec < 0)
9260 prec = 0;
9261 if (--fmtcnt >= 0)
9262 c = *fmt++;
9263 }
9264 else if (c >= '0' && c <= '9') {
9265 prec = c - '0';
9266 while (--fmtcnt >= 0) {
9267 c = Py_CHARMASK(*fmt++);
9268 if (c < '0' || c > '9')
9269 break;
9270 if ((prec*10) / 10 != prec) {
9271 PyErr_SetString(PyExc_ValueError,
9272 "prec too big");
9273 goto onError;
9274 }
9275 prec = prec*10 + (c - '0');
9276 }
9277 }
9278 } /* prec */
9279 if (fmtcnt >= 0) {
9280 if (c == 'h' || c == 'l' || c == 'L') {
9281 if (--fmtcnt >= 0)
9282 c = *fmt++;
9283 }
9284 }
9285 if (fmtcnt < 0) {
9286 PyErr_SetString(PyExc_ValueError,
9287 "incomplete format");
9288 goto onError;
9289 }
9290 if (c != '%') {
9291 v = getnextarg(args, arglen, &argidx);
9292 if (v == NULL)
9293 goto onError;
9294 }
9295 sign = 0;
9296 fill = ' ';
9297 switch (c) {
9298
9299 case '%':
9300 pbuf = formatbuf;
9301 /* presume that buffer length is at least 1 */
9302 pbuf[0] = '%';
9303 len = 1;
9304 break;
9305
9306 case 's':
9307 case 'r':
9308 case 'a':
9309 if (PyUnicode_Check(v) && c == 's') {
9310 temp = v;
9311 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009312 }
9313 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009314 if (c == 's')
9315 temp = PyObject_Str(v);
9316 else if (c == 'r')
9317 temp = PyObject_Repr(v);
9318 else
9319 temp = PyObject_ASCII(v);
9320 if (temp == NULL)
9321 goto onError;
9322 if (PyUnicode_Check(temp))
9323 /* nothing to do */;
9324 else {
9325 Py_DECREF(temp);
9326 PyErr_SetString(PyExc_TypeError,
9327 "%s argument has non-string str()");
9328 goto onError;
9329 }
9330 }
9331 pbuf = PyUnicode_AS_UNICODE(temp);
9332 len = PyUnicode_GET_SIZE(temp);
9333 if (prec >= 0 && len > prec)
9334 len = prec;
9335 break;
9336
9337 case 'i':
9338 case 'd':
9339 case 'u':
9340 case 'o':
9341 case 'x':
9342 case 'X':
9343 if (c == 'i')
9344 c = 'd';
9345 isnumok = 0;
9346 if (PyNumber_Check(v)) {
9347 PyObject *iobj=NULL;
9348
9349 if (PyLong_Check(v)) {
9350 iobj = v;
9351 Py_INCREF(iobj);
9352 }
9353 else {
9354 iobj = PyNumber_Long(v);
9355 }
9356 if (iobj!=NULL) {
9357 if (PyLong_Check(iobj)) {
9358 isnumok = 1;
9359 temp = formatlong(iobj, flags, prec, c);
9360 Py_DECREF(iobj);
9361 if (!temp)
9362 goto onError;
9363 pbuf = PyUnicode_AS_UNICODE(temp);
9364 len = PyUnicode_GET_SIZE(temp);
9365 sign = 1;
9366 }
9367 else {
9368 Py_DECREF(iobj);
9369 }
9370 }
9371 }
9372 if (!isnumok) {
9373 PyErr_Format(PyExc_TypeError,
9374 "%%%c format: a number is required, "
9375 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9376 goto onError;
9377 }
9378 if (flags & F_ZERO)
9379 fill = '0';
9380 break;
9381
9382 case 'e':
9383 case 'E':
9384 case 'f':
9385 case 'F':
9386 case 'g':
9387 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009388 temp = formatfloat(v, flags, prec, c);
9389 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009391 pbuf = PyUnicode_AS_UNICODE(temp);
9392 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 sign = 1;
9394 if (flags & F_ZERO)
9395 fill = '0';
9396 break;
9397
9398 case 'c':
9399 pbuf = formatbuf;
9400 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9401 if (len < 0)
9402 goto onError;
9403 break;
9404
9405 default:
9406 PyErr_Format(PyExc_ValueError,
9407 "unsupported format character '%c' (0x%x) "
9408 "at index %zd",
9409 (31<=c && c<=126) ? (char)c : '?',
9410 (int)c,
9411 (Py_ssize_t)(fmt - 1 -
9412 PyUnicode_AS_UNICODE(uformat)));
9413 goto onError;
9414 }
9415 if (sign) {
9416 if (*pbuf == '-' || *pbuf == '+') {
9417 sign = *pbuf++;
9418 len--;
9419 }
9420 else if (flags & F_SIGN)
9421 sign = '+';
9422 else if (flags & F_BLANK)
9423 sign = ' ';
9424 else
9425 sign = 0;
9426 }
9427 if (width < len)
9428 width = len;
9429 if (rescnt - (sign != 0) < width) {
9430 reslen -= rescnt;
9431 rescnt = width + fmtcnt + 100;
9432 reslen += rescnt;
9433 if (reslen < 0) {
9434 Py_XDECREF(temp);
9435 PyErr_NoMemory();
9436 goto onError;
9437 }
9438 if (_PyUnicode_Resize(&result, reslen) < 0) {
9439 Py_XDECREF(temp);
9440 goto onError;
9441 }
9442 res = PyUnicode_AS_UNICODE(result)
9443 + reslen - rescnt;
9444 }
9445 if (sign) {
9446 if (fill != ' ')
9447 *res++ = sign;
9448 rescnt--;
9449 if (width > len)
9450 width--;
9451 }
9452 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9453 assert(pbuf[0] == '0');
9454 assert(pbuf[1] == c);
9455 if (fill != ' ') {
9456 *res++ = *pbuf++;
9457 *res++ = *pbuf++;
9458 }
9459 rescnt -= 2;
9460 width -= 2;
9461 if (width < 0)
9462 width = 0;
9463 len -= 2;
9464 }
9465 if (width > len && !(flags & F_LJUST)) {
9466 do {
9467 --rescnt;
9468 *res++ = fill;
9469 } while (--width > len);
9470 }
9471 if (fill == ' ') {
9472 if (sign)
9473 *res++ = sign;
9474 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9475 assert(pbuf[0] == '0');
9476 assert(pbuf[1] == c);
9477 *res++ = *pbuf++;
9478 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009479 }
9480 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 Py_UNICODE_COPY(res, pbuf, len);
9482 res += len;
9483 rescnt -= len;
9484 while (--width >= len) {
9485 --rescnt;
9486 *res++ = ' ';
9487 }
9488 if (dict && (argidx < arglen) && c != '%') {
9489 PyErr_SetString(PyExc_TypeError,
9490 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009491 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 goto onError;
9493 }
9494 Py_XDECREF(temp);
9495 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 } /* until end */
9497 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 PyErr_SetString(PyExc_TypeError,
9499 "not all arguments converted during string formatting");
9500 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
9502
Thomas Woutersa96affe2006-03-12 00:29:36 +00009503 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
9508 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 return (PyObject *)result;
9510
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 Py_XDECREF(result);
9513 Py_DECREF(uformat);
9514 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
9517 return NULL;
9518}
9519
Jeremy Hylton938ace62002-07-17 16:30:39 +00009520static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009521unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9522
Tim Peters6d6c1a32001-08-02 04:15:00 +00009523static PyObject *
9524unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9525{
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009527 static char *kwlist[] = {"object", "encoding", "errors", 0};
9528 char *encoding = NULL;
9529 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009530
Benjamin Peterson14339b62009-01-31 16:36:08 +00009531 if (type != &PyUnicode_Type)
9532 return unicode_subtype_new(type, args, kwds);
9533 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009535 return NULL;
9536 if (x == NULL)
9537 return (PyObject *)_PyUnicode_New(0);
9538 if (encoding == NULL && errors == NULL)
9539 return PyObject_Str(x);
9540 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009542}
9543
Guido van Rossume023fe02001-08-30 03:12:59 +00009544static PyObject *
9545unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9546{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009547 PyUnicodeObject *tmp, *pnew;
9548 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009549
Benjamin Peterson14339b62009-01-31 16:36:08 +00009550 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9551 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9552 if (tmp == NULL)
9553 return NULL;
9554 assert(PyUnicode_Check(tmp));
9555 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9556 if (pnew == NULL) {
9557 Py_DECREF(tmp);
9558 return NULL;
9559 }
9560 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9561 if (pnew->str == NULL) {
9562 _Py_ForgetReference((PyObject *)pnew);
9563 PyObject_Del(pnew);
9564 Py_DECREF(tmp);
9565 return PyErr_NoMemory();
9566 }
9567 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9568 pnew->length = n;
9569 pnew->hash = tmp->hash;
9570 Py_DECREF(tmp);
9571 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009572}
9573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009574PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009576\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009577Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009578encoding defaults to the current default string encoding.\n\
9579errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009580
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009581static PyObject *unicode_iter(PyObject *seq);
9582
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009584 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009585 "str", /* tp_name */
9586 sizeof(PyUnicodeObject), /* tp_size */
9587 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009589 (destructor)unicode_dealloc, /* tp_dealloc */
9590 0, /* tp_print */
9591 0, /* tp_getattr */
9592 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009593 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 unicode_repr, /* tp_repr */
9595 &unicode_as_number, /* tp_as_number */
9596 &unicode_as_sequence, /* tp_as_sequence */
9597 &unicode_as_mapping, /* tp_as_mapping */
9598 (hashfunc) unicode_hash, /* tp_hash*/
9599 0, /* tp_call*/
9600 (reprfunc) unicode_str, /* tp_str */
9601 PyObject_GenericGetAttr, /* tp_getattro */
9602 0, /* tp_setattro */
9603 0, /* tp_as_buffer */
9604 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009606 unicode_doc, /* tp_doc */
9607 0, /* tp_traverse */
9608 0, /* tp_clear */
9609 PyUnicode_RichCompare, /* tp_richcompare */
9610 0, /* tp_weaklistoffset */
9611 unicode_iter, /* tp_iter */
9612 0, /* tp_iternext */
9613 unicode_methods, /* tp_methods */
9614 0, /* tp_members */
9615 0, /* tp_getset */
9616 &PyBaseObject_Type, /* tp_base */
9617 0, /* tp_dict */
9618 0, /* tp_descr_get */
9619 0, /* tp_descr_set */
9620 0, /* tp_dictoffset */
9621 0, /* tp_init */
9622 0, /* tp_alloc */
9623 unicode_new, /* tp_new */
9624 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625};
9626
9627/* Initialize the Unicode implementation */
9628
Thomas Wouters78890102000-07-22 19:25:51 +00009629void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009631 int i;
9632
Thomas Wouters477c8d52006-05-27 19:21:47 +00009633 /* XXX - move this array to unicodectype.c ? */
9634 Py_UNICODE linebreak[] = {
9635 0x000A, /* LINE FEED */
9636 0x000D, /* CARRIAGE RETURN */
9637 0x001C, /* FILE SEPARATOR */
9638 0x001D, /* GROUP SEPARATOR */
9639 0x001E, /* RECORD SEPARATOR */
9640 0x0085, /* NEXT LINE */
9641 0x2028, /* LINE SEPARATOR */
9642 0x2029, /* PARAGRAPH SEPARATOR */
9643 };
9644
Fred Drakee4315f52000-05-09 19:53:39 +00009645 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009646 free_list = NULL;
9647 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009649 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009651
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009652 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009654 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009656
9657 /* initialize the linebreak bloom filter */
9658 bloom_linebreak = make_bloom_mask(
9659 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9660 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009661
9662 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
9665/* Finalize the Unicode implementation */
9666
Christian Heimesa156e092008-02-16 07:38:31 +00009667int
9668PyUnicode_ClearFreeList(void)
9669{
9670 int freelist_size = numfree;
9671 PyUnicodeObject *u;
9672
9673 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 PyUnicodeObject *v = u;
9675 u = *(PyUnicodeObject **)u;
9676 if (v->str)
9677 PyObject_DEL(v->str);
9678 Py_XDECREF(v->defenc);
9679 PyObject_Del(v);
9680 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009681 }
9682 free_list = NULL;
9683 assert(numfree == 0);
9684 return freelist_size;
9685}
9686
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687void
Thomas Wouters78890102000-07-22 19:25:51 +00009688_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009690 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009692 Py_XDECREF(unicode_empty);
9693 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009694
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009695 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 if (unicode_latin1[i]) {
9697 Py_DECREF(unicode_latin1[i]);
9698 unicode_latin1[i] = NULL;
9699 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009700 }
Christian Heimesa156e092008-02-16 07:38:31 +00009701 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009703
Walter Dörwald16807132007-05-25 13:52:07 +00009704void
9705PyUnicode_InternInPlace(PyObject **p)
9706{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9708 PyObject *t;
9709 if (s == NULL || !PyUnicode_Check(s))
9710 Py_FatalError(
9711 "PyUnicode_InternInPlace: unicode strings only please!");
9712 /* If it's a subclass, we don't really know what putting
9713 it in the interned dict might do. */
9714 if (!PyUnicode_CheckExact(s))
9715 return;
9716 if (PyUnicode_CHECK_INTERNED(s))
9717 return;
9718 if (interned == NULL) {
9719 interned = PyDict_New();
9720 if (interned == NULL) {
9721 PyErr_Clear(); /* Don't leave an exception */
9722 return;
9723 }
9724 }
9725 /* It might be that the GetItem call fails even
9726 though the key is present in the dictionary,
9727 namely when this happens during a stack overflow. */
9728 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009729 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009731
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 if (t) {
9733 Py_INCREF(t);
9734 Py_DECREF(*p);
9735 *p = t;
9736 return;
9737 }
Walter Dörwald16807132007-05-25 13:52:07 +00009738
Benjamin Peterson14339b62009-01-31 16:36:08 +00009739 PyThreadState_GET()->recursion_critical = 1;
9740 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9741 PyErr_Clear();
9742 PyThreadState_GET()->recursion_critical = 0;
9743 return;
9744 }
9745 PyThreadState_GET()->recursion_critical = 0;
9746 /* The two references in interned are not counted by refcnt.
9747 The deallocator will take care of this */
9748 Py_REFCNT(s) -= 2;
9749 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009750}
9751
9752void
9753PyUnicode_InternImmortal(PyObject **p)
9754{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009755 PyUnicode_InternInPlace(p);
9756 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9757 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9758 Py_INCREF(*p);
9759 }
Walter Dörwald16807132007-05-25 13:52:07 +00009760}
9761
9762PyObject *
9763PyUnicode_InternFromString(const char *cp)
9764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 PyObject *s = PyUnicode_FromString(cp);
9766 if (s == NULL)
9767 return NULL;
9768 PyUnicode_InternInPlace(&s);
9769 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009770}
9771
9772void _Py_ReleaseInternedUnicodeStrings(void)
9773{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009774 PyObject *keys;
9775 PyUnicodeObject *s;
9776 Py_ssize_t i, n;
9777 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009778
Benjamin Peterson14339b62009-01-31 16:36:08 +00009779 if (interned == NULL || !PyDict_Check(interned))
9780 return;
9781 keys = PyDict_Keys(interned);
9782 if (keys == NULL || !PyList_Check(keys)) {
9783 PyErr_Clear();
9784 return;
9785 }
Walter Dörwald16807132007-05-25 13:52:07 +00009786
Benjamin Peterson14339b62009-01-31 16:36:08 +00009787 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9788 detector, interned unicode strings are not forcibly deallocated;
9789 rather, we give them their stolen references back, and then clear
9790 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009791
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792 n = PyList_GET_SIZE(keys);
9793 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009794 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009795 for (i = 0; i < n; i++) {
9796 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9797 switch (s->state) {
9798 case SSTATE_NOT_INTERNED:
9799 /* XXX Shouldn't happen */
9800 break;
9801 case SSTATE_INTERNED_IMMORTAL:
9802 Py_REFCNT(s) += 1;
9803 immortal_size += s->length;
9804 break;
9805 case SSTATE_INTERNED_MORTAL:
9806 Py_REFCNT(s) += 2;
9807 mortal_size += s->length;
9808 break;
9809 default:
9810 Py_FatalError("Inconsistent interned string state.");
9811 }
9812 s->state = SSTATE_NOT_INTERNED;
9813 }
9814 fprintf(stderr, "total size of all interned strings: "
9815 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9816 "mortal/immortal\n", mortal_size, immortal_size);
9817 Py_DECREF(keys);
9818 PyDict_Clear(interned);
9819 Py_DECREF(interned);
9820 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009821}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009822
9823
9824/********************* Unicode Iterator **************************/
9825
9826typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009827 PyObject_HEAD
9828 Py_ssize_t it_index;
9829 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009830} unicodeiterobject;
9831
9832static void
9833unicodeiter_dealloc(unicodeiterobject *it)
9834{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835 _PyObject_GC_UNTRACK(it);
9836 Py_XDECREF(it->it_seq);
9837 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009838}
9839
9840static int
9841unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9842{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009843 Py_VISIT(it->it_seq);
9844 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009845}
9846
9847static PyObject *
9848unicodeiter_next(unicodeiterobject *it)
9849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009850 PyUnicodeObject *seq;
9851 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009852
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 assert(it != NULL);
9854 seq = it->it_seq;
9855 if (seq == NULL)
9856 return NULL;
9857 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009858
Benjamin Peterson14339b62009-01-31 16:36:08 +00009859 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9860 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009861 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009862 if (item != NULL)
9863 ++it->it_index;
9864 return item;
9865 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009866
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867 Py_DECREF(seq);
9868 it->it_seq = NULL;
9869 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009870}
9871
9872static PyObject *
9873unicodeiter_len(unicodeiterobject *it)
9874{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009875 Py_ssize_t len = 0;
9876 if (it->it_seq)
9877 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9878 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009879}
9880
9881PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9882
9883static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009884 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009885 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009887};
9888
9889PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9891 "str_iterator", /* tp_name */
9892 sizeof(unicodeiterobject), /* tp_basicsize */
9893 0, /* tp_itemsize */
9894 /* methods */
9895 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9896 0, /* tp_print */
9897 0, /* tp_getattr */
9898 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009899 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009900 0, /* tp_repr */
9901 0, /* tp_as_number */
9902 0, /* tp_as_sequence */
9903 0, /* tp_as_mapping */
9904 0, /* tp_hash */
9905 0, /* tp_call */
9906 0, /* tp_str */
9907 PyObject_GenericGetAttr, /* tp_getattro */
9908 0, /* tp_setattro */
9909 0, /* tp_as_buffer */
9910 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9911 0, /* tp_doc */
9912 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9913 0, /* tp_clear */
9914 0, /* tp_richcompare */
9915 0, /* tp_weaklistoffset */
9916 PyObject_SelfIter, /* tp_iter */
9917 (iternextfunc)unicodeiter_next, /* tp_iternext */
9918 unicodeiter_methods, /* tp_methods */
9919 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009920};
9921
9922static PyObject *
9923unicode_iter(PyObject *seq)
9924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009925 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009926
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 if (!PyUnicode_Check(seq)) {
9928 PyErr_BadInternalCall();
9929 return NULL;
9930 }
9931 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9932 if (it == NULL)
9933 return NULL;
9934 it->it_index = 0;
9935 Py_INCREF(seq);
9936 it->it_seq = (PyUnicodeObject *)seq;
9937 _PyObject_GC_TRACK(it);
9938 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009939}
9940
Martin v. Löwis5b222132007-06-10 09:51:05 +00009941size_t
9942Py_UNICODE_strlen(const Py_UNICODE *u)
9943{
9944 int res = 0;
9945 while(*u++)
9946 res++;
9947 return res;
9948}
9949
9950Py_UNICODE*
9951Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9952{
9953 Py_UNICODE *u = s1;
9954 while ((*u++ = *s2++));
9955 return s1;
9956}
9957
9958Py_UNICODE*
9959Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9960{
9961 Py_UNICODE *u = s1;
9962 while ((*u++ = *s2++))
9963 if (n-- == 0)
9964 break;
9965 return s1;
9966}
9967
9968int
9969Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9970{
9971 while (*s1 && *s2 && *s1 == *s2)
9972 s1++, s2++;
9973 if (*s1 && *s2)
9974 return (*s1 < *s2) ? -1 : +1;
9975 if (*s1)
9976 return 1;
9977 if (*s2)
9978 return -1;
9979 return 0;
9980}
9981
9982Py_UNICODE*
9983Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9984{
9985 const Py_UNICODE *p;
9986 for (p = s; *p; p++)
9987 if (*p == c)
9988 return (Py_UNICODE*)p;
9989 return NULL;
9990}
9991
9992
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009993#ifdef __cplusplus
9994}
9995#endif
9996
9997
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009998/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 Local variables:
10000 c-basic-offset: 4
10001 indent-tabs-mode: nil
10002 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010003*/