blob: c9ff05979264de367b5f518a6a207513fb804cec [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by an ASCII code (0..127): an entry is 1 when the
   code is one of the whitespace characters listed below, 0 otherwise.
   Each row covers sixteen consecutive codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Christian Heimes190d79e2008-01-30 11:58:22 +0000162/* Same for linebreaks */
/* Lookup table indexed by an ASCII code (0..127): an entry is 1 when the
   code is treated as a line break, 0 otherwise.  The table is only ever
   read, so it is declared const to keep it out of writable storage and to
   make accidental modification a compile-time error. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type holding a bloom bitmask; bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bitmask consulted by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of `ch` is set in
   `mask`.  The shifted constant is unsigned long: shifting a plain int 1
   by 31 would move a bit into the sign position, which is undefined
   behaviour.  `mask` is parenthesised so that an expression argument such
   as `a | b` is combined before the `&` is applied. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 are looked up directly in
   ascii_linebreak; all others are first screened with the bloom mask and
   only then passed to the full Py_UNICODE_ISLINEBREAK check.
   Note: `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* True when `chr` is one of the `setlen` characters in `set`.  The cheap
   BLOOM() bit test runs first so that unicode_member() is only called for
   characters whose bit is present in `mask`.

   The whole expansion and each argument are parenthesised so the macro
   behaves as a single operand (e.g. under `!`).  `chr` is evaluated
   twice, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000794 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* Convert the argument to a bytes object, according to the file
1534 system encoding */
1535
1536int
1537PyUnicode_FSConverter(PyObject* arg, void* addr)
1538{
1539 PyObject *output = NULL;
1540 Py_ssize_t size;
1541 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001542 if (arg == NULL) {
1543 Py_DECREF(*(PyObject**)addr);
1544 return 1;
1545 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001546 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1547 output = arg;
1548 Py_INCREF(output);
1549 }
1550 else {
1551 arg = PyUnicode_FromObject(arg);
1552 if (!arg)
1553 return 0;
1554 output = PyUnicode_AsEncodedObject(arg,
1555 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001556 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001557 Py_DECREF(arg);
1558 if (!output)
1559 return 0;
1560 if (!PyBytes_Check(output)) {
1561 Py_DECREF(output);
1562 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1563 return 0;
1564 }
1565 }
1566 if (PyBytes_Check(output)) {
1567 size = PyBytes_GET_SIZE(output);
1568 data = PyBytes_AS_STRING(output);
1569 }
1570 else {
1571 size = PyByteArray_GET_SIZE(output);
1572 data = PyByteArray_AS_STRING(output);
1573 }
1574 if (size != strlen(data)) {
1575 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1576 Py_DECREF(output);
1577 return 0;
1578 }
1579 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001580 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001581}
1582
1583
Martin v. Löwis5b222132007-06-10 09:51:05 +00001584char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001585_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001586{
Christian Heimesf3863112007-11-22 07:46:41 +00001587 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001588 if (!PyUnicode_Check(unicode)) {
1589 PyErr_BadArgument();
1590 return NULL;
1591 }
Christian Heimesf3863112007-11-22 07:46:41 +00001592 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1593 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001594 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001595 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001596 *psize = PyBytes_GET_SIZE(bytes);
1597 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598}
1599
1600char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001601_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001602{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001603 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001604}
1605
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1607{
1608 if (!PyUnicode_Check(unicode)) {
1609 PyErr_BadArgument();
1610 goto onError;
1611 }
1612 return PyUnicode_AS_UNICODE(unicode);
1613
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 return NULL;
1616}
1617
Martin v. Löwis18e16552006-02-15 17:27:45 +00001618Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619{
1620 if (!PyUnicode_Check(unicode)) {
1621 PyErr_BadArgument();
1622 goto onError;
1623 }
1624 return PyUnicode_GET_SIZE(unicode);
1625
Benjamin Peterson29060642009-01-31 22:14:21 +00001626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 return -1;
1628}
1629
Thomas Wouters78890102000-07-22 19:25:51 +00001630const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001631{
1632 return unicode_default_encoding;
1633}
1634
1635int PyUnicode_SetDefaultEncoding(const char *encoding)
1636{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001637 if (strcmp(encoding, unicode_default_encoding) != 0) {
1638 PyErr_Format(PyExc_ValueError,
1639 "Can only set default encoding to %s",
1640 unicode_default_encoding);
1641 return -1;
1642 }
Fred Drakee4315f52000-05-09 19:53:39 +00001643 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001644}
1645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646/* error handling callback helper:
1647 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001648 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 and adjust various state variables.
1650 return 0 on success, -1 on error
1651*/
1652
1653static
1654int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001655 const char *encoding, const char *reason,
1656 const char **input, const char **inend, Py_ssize_t *startinpos,
1657 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1658 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001660 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661
1662 PyObject *restuple = NULL;
1663 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001664 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001665 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t requiredsize;
1667 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001669 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001670 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 int res = -1;
1672
1673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 *errorHandler = PyCodec_LookupError(errors);
1675 if (*errorHandler == NULL)
1676 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 }
1678
1679 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1682 if (*exceptionObject == NULL)
1683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 }
1685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001686 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1687 goto onError;
1688 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1689 goto onError;
1690 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 }
1693
1694 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1695 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001696 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001698 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 }
1701 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001703
1704 /* Copy back the bytes variables, which might have been modified by the
1705 callback */
1706 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1707 if (!inputobj)
1708 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001709 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001710 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001711 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001712 *input = PyBytes_AS_STRING(inputobj);
1713 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001715 /* we can DECREF safely, as the exception has another reference,
1716 so the object won't go away. */
1717 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001720 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001721 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1723 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725
1726 /* need more space? (at least enough for what we
1727 have+the replacement+the rest of the string (starting
1728 at the new input position), so we won't have to check space
1729 when there are no errors in the rest of the string) */
1730 repptr = PyUnicode_AS_UNICODE(repunicode);
1731 repsize = PyUnicode_GET_SIZE(repunicode);
1732 requiredsize = *outpos + repsize + insize-newpos;
1733 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 if (requiredsize<2*outsize)
1735 requiredsize = 2*outsize;
1736 if (_PyUnicode_Resize(output, requiredsize) < 0)
1737 goto onError;
1738 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 }
1740 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001741 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_UNICODE_COPY(*outptr, repptr, repsize);
1743 *outptr += repsize;
1744 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 /* we made it! */
1747 res = 0;
1748
Benjamin Peterson29060642009-01-31 22:14:21 +00001749 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 Py_XDECREF(restuple);
1751 return res;
1752}
1753
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754/* --- UTF-7 Codec -------------------------------------------------------- */
1755
Antoine Pitrou244651a2009-05-04 18:56:13 +00001756/* See RFC2152 for details. We encode conservatively and decode liberally. */
1757
/* Helper macros for the base-64 alphabet used by UTF-7.  Note that the
   argument is evaluated more than once. */

/* True if c belongs to the base-64 alphabet. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Numeric value (0..63) of the base-64 character c; only meaningful
   when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                                       \
     "abcdefghijklmnopqrstuvwxyz"                                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
1788
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c         character to test (evaluated several times)
 * directO   non-zero if Set O characters are written directly
 * directWS  non-zero if whitespace is written directly
 *
 * All arguments are parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 Py_ssize_t size,
1837 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001839 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1840}
1841
Antoine Pitrou244651a2009-05-04 18:56:13 +00001842/* The decoder. The only state we preserve is our read position,
1843 * i.e. how many characters we have consumed. So if we end in the
1844 * middle of a shift sequence we have to back off the read position
1845 * and the output to the beginning of the sequence, otherwise we lose
1846 * all the shift state (seen bits, number of bits seen, high
1847 * surrogate). */
1848
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001849PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001850 Py_ssize_t size,
1851 const char *errors,
1852 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001855 Py_ssize_t startinpos;
1856 Py_ssize_t endinpos;
1857 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 const char *e;
1859 PyUnicodeObject *unicode;
1860 Py_UNICODE *p;
1861 const char *errmsg = "";
1862 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001863 Py_UNICODE *shiftOutStart;
1864 unsigned int base64bits = 0;
1865 unsigned long base64buffer = 0;
1866 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 PyObject *errorHandler = NULL;
1868 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001869
1870 unicode = _PyUnicode_New(size);
1871 if (!unicode)
1872 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001873 if (size == 0) {
1874 if (consumed)
1875 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001877 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878
1879 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001880 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 e = s + size;
1882
1883 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001885 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001886 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001887
Antoine Pitrou244651a2009-05-04 18:56:13 +00001888 if (inShift) { /* in a base-64 section */
1889 if (IS_BASE64(ch)) { /* consume a base-64 character */
1890 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1891 base64bits += 6;
1892 s++;
1893 if (base64bits >= 16) {
1894 /* we have enough bits for a UTF-16 value */
1895 Py_UNICODE outCh = (Py_UNICODE)
1896 (base64buffer >> (base64bits-16));
1897 base64bits -= 16;
1898 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1899 if (surrogate) {
1900 /* expecting a second surrogate */
1901 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1902#ifdef Py_UNICODE_WIDE
1903 *p++ = (((surrogate & 0x3FF)<<10)
1904 | (outCh & 0x3FF)) + 0x10000;
1905#else
1906 *p++ = surrogate;
1907 *p++ = outCh;
1908#endif
1909 surrogate = 0;
1910 }
1911 else {
1912 surrogate = 0;
1913 errmsg = "second surrogate missing";
1914 goto utf7Error;
1915 }
1916 }
1917 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1918 /* first surrogate */
1919 surrogate = outCh;
1920 }
1921 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1922 errmsg = "unexpected second surrogate";
1923 goto utf7Error;
1924 }
1925 else {
1926 *p++ = outCh;
1927 }
1928 }
1929 }
1930 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001931 inShift = 0;
1932 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001933 if (surrogate) {
1934 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001937 if (base64bits > 0) { /* left-over bits */
1938 if (base64bits >= 6) {
1939 /* We've seen at least one base-64 character */
1940 errmsg = "partial character in shift sequence";
1941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001942 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001943 else {
1944 /* Some bits remain; they should be zero */
1945 if (base64buffer != 0) {
1946 errmsg = "non-zero padding bits in shift sequence";
1947 goto utf7Error;
1948 }
1949 }
1950 }
1951 if (ch != '-') {
1952 /* '-' is absorbed; other terminating
1953 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001954 *p++ = ch;
1955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956 }
1957 }
1958 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960 s++; /* consume '+' */
1961 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 s++;
1963 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001964 }
1965 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001966 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 shiftOutStart = p;
1968 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969 }
1970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001971 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 *p++ = ch;
1973 s++;
1974 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 else {
1976 startinpos = s-starts;
1977 s++;
1978 errmsg = "unexpected special character";
1979 goto utf7Error;
1980 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001981 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001982utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001983 outpos = p-PyUnicode_AS_UNICODE(unicode);
1984 endinpos = s-starts;
1985 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 errors, &errorHandler,
1987 "utf7", errmsg,
1988 &starts, &e, &startinpos, &endinpos, &exc, &s,
1989 &unicode, &outpos, &p))
1990 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001991 }
1992
Antoine Pitrou244651a2009-05-04 18:56:13 +00001993 /* end of string */
1994
1995 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1996 /* if we're in an inconsistent state, that's an error */
1997 if (surrogate ||
1998 (base64bits >= 6) ||
1999 (base64bits > 0 && base64buffer != 0)) {
2000 outpos = p-PyUnicode_AS_UNICODE(unicode);
2001 endinpos = size;
2002 if (unicode_decode_call_errorhandler(
2003 errors, &errorHandler,
2004 "utf7", "unterminated shift sequence",
2005 &starts, &e, &startinpos, &endinpos, &exc, &s,
2006 &unicode, &outpos, &p))
2007 goto onError;
2008 if (s < e)
2009 goto restart;
2010 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012
2013 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002014 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 if (inShift) {
2016 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 }
2019 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002024 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 goto onError;
2026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 Py_XDECREF(errorHandler);
2028 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 return (PyObject *)unicode;
2030
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 Py_XDECREF(errorHandler);
2033 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034 Py_DECREF(unicode);
2035 return NULL;
2036}
2037
2038
2039PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 int base64SetO,
2042 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002045 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002046 /* It might be possible to tighten this worst case */
Georg Brandl194da4a2009-08-13 09:34:05 +00002047 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002049 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002050 unsigned int base64bits = 0;
2051 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002052 char * out;
2053 char * start;
2054
2055 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002057
Georg Brandl194da4a2009-08-13 09:34:05 +00002058 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002059 return PyErr_NoMemory();
2060
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 if (v == NULL)
2063 return NULL;
2064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002065 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 for (;i < size; ++i) {
2067 Py_UNICODE ch = s[i];
2068
Antoine Pitrou244651a2009-05-04 18:56:13 +00002069 if (inShift) {
2070 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2071 /* shifting out */
2072 if (base64bits) { /* output remaining bits */
2073 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2074 base64buffer = 0;
2075 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002076 }
2077 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 /* Characters not in the BASE64 set implicitly unshift the sequence
2079 so no '-' is required, except if the character is itself a '-' */
2080 if (IS_BASE64(ch) || ch == '-') {
2081 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002082 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083 *out++ = (char) ch;
2084 }
2085 else {
2086 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002087 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 else { /* not in a shift sequence */
2090 if (ch == '+') {
2091 *out++ = '+';
2092 *out++ = '-';
2093 }
2094 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2095 *out++ = (char) ch;
2096 }
2097 else {
2098 *out++ = '+';
2099 inShift = 1;
2100 goto encode_char;
2101 }
2102 }
2103 continue;
2104encode_char:
2105#ifdef Py_UNICODE_WIDE
2106 if (ch >= 0x10000) {
2107 /* code first surrogate */
2108 base64bits += 16;
2109 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2110 while (base64bits >= 6) {
2111 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2112 base64bits -= 6;
2113 }
2114 /* prepare second surrogate */
2115 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2116 }
2117#endif
2118 base64bits += 16;
2119 base64buffer = (base64buffer << 16) | ch;
2120 while (base64bits >= 6) {
2121 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2122 base64bits -= 6;
2123 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002124 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002125 if (base64bits)
2126 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2127 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002129 if (_PyBytes_Resize(&v, out - start) < 0)
2130 return NULL;
2131 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002132}
2133
Antoine Pitrou244651a2009-05-04 18:56:13 +00002134#undef IS_BASE64
2135#undef FROM_BASE64
2136#undef TO_BASE64
2137#undef DECODE_DIRECT
2138#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a valid prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four, five and six byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

Guido van Rossumd57fd912000-03-10 22:53:23 +00002164PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 Py_ssize_t size,
2166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
Walter Dörwald69652032004-09-07 20:24:22 +00002168 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2169}
2170
Antoine Pitrouab868312009-01-10 15:40:25 +00002171/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2172#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2173
2174/* Mask to quickly check whether a C 'long' contains a
2175 non-ASCII, UTF8-encoded char. */
2176#if (SIZEOF_LONG == 8)
2177# define ASCII_CHAR_MASK 0x8080808080808080L
2178#elif (SIZEOF_LONG == 4)
2179# define ASCII_CHAR_MASK 0x80808080L
2180#else
2181# error C 'long' size should be either 4 or 8!
2182#endif
2183
Walter Dörwald69652032004-09-07 20:24:22 +00002184PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002185 Py_ssize_t size,
2186 const char *errors,
2187 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002188{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002191 Py_ssize_t startinpos;
2192 Py_ssize_t endinpos;
2193 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002194 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 PyUnicodeObject *unicode;
2196 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002197 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002198 PyObject *errorHandler = NULL;
2199 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200
2201 /* Note: size will always be longer than the resulting Unicode
2202 character count */
2203 unicode = _PyUnicode_New(size);
2204 if (!unicode)
2205 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002206 if (size == 0) {
2207 if (consumed)
2208 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
2212 /* Unpack UTF-8 encoded data */
2213 p = unicode->str;
2214 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002215 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002218 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219
2220 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002221 /* Fast path for runs of ASCII characters. Given that common UTF-8
2222 input will consist of an overwhelming majority of ASCII
2223 characters, we try to optimize for this case by checking
2224 as many characters as a C 'long' can contain.
2225 First, check if we can do an aligned read, as most CPUs have
2226 a penalty for unaligned reads.
2227 */
2228 if (!((size_t) s & LONG_PTR_MASK)) {
2229 /* Help register allocation */
2230 register const char *_s = s;
2231 register Py_UNICODE *_p = p;
2232 while (_s < aligned_end) {
2233 /* Read a whole long at a time (either 4 or 8 bytes),
2234 and do a fast unrolled copy if it only contains ASCII
2235 characters. */
2236 unsigned long data = *(unsigned long *) _s;
2237 if (data & ASCII_CHAR_MASK)
2238 break;
2239 _p[0] = (unsigned char) _s[0];
2240 _p[1] = (unsigned char) _s[1];
2241 _p[2] = (unsigned char) _s[2];
2242 _p[3] = (unsigned char) _s[3];
2243#if (SIZEOF_LONG == 8)
2244 _p[4] = (unsigned char) _s[4];
2245 _p[5] = (unsigned char) _s[5];
2246 _p[6] = (unsigned char) _s[6];
2247 _p[7] = (unsigned char) _s[7];
2248#endif
2249 _s += SIZEOF_LONG;
2250 _p += SIZEOF_LONG;
2251 }
2252 s = _s;
2253 p = _p;
2254 if (s == e)
2255 break;
2256 ch = (unsigned char)*s;
2257 }
2258 }
2259
2260 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 s++;
2263 continue;
2264 }
2265
2266 n = utf8_code_length[ch];
2267
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002268 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 if (consumed)
2270 break;
2271 else {
2272 errmsg = "unexpected end of data";
2273 startinpos = s-starts;
2274 endinpos = size;
2275 goto utf8Error;
2276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278
2279 switch (n) {
2280
2281 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002282 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 startinpos = s-starts;
2284 endinpos = startinpos+1;
2285 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286
2287 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002288 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 if ((s[1] & 0xc0) != 0x80) {
2295 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002296 startinpos = s-starts;
2297 endinpos = startinpos+2;
2298 goto utf8Error;
2299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002301 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
2303 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002304 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002305 goto utf8Error;
2306 }
2307 else
2308 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 break;
2310
2311 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002312 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002313 (s[2] & 0xc0) != 0x80) {
2314 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002315 startinpos = s-starts;
2316 endinpos = startinpos+3;
2317 goto utf8Error;
2318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002320 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002321 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002322 startinpos = s-starts;
2323 endinpos = startinpos+3;
2324 goto utf8Error;
2325 }
2326 else
2327 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002328 break;
2329
2330 case 4:
2331 if ((s[1] & 0xc0) != 0x80 ||
2332 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002333 (s[3] & 0xc0) != 0x80) {
2334 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002335 startinpos = s-starts;
2336 endinpos = startinpos+4;
2337 goto utf8Error;
2338 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002339 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002341 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002342 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002344 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002345 UTF-16 */
2346 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002347 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 startinpos = s-starts;
2349 endinpos = startinpos+4;
2350 goto utf8Error;
2351 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002352#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002354#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002355 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002356
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002357 /* translate from 10000..10FFFF to 0..FFFF */
2358 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002359
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002360 /* high surrogate = top 10 bits added to D800 */
2361 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002362
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002363 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002364 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002365#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 break;
2367
2368 default:
2369 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002370 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002371 startinpos = s-starts;
2372 endinpos = startinpos+n;
2373 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 }
2375 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Benjamin Peterson29060642009-01-31 22:14:21 +00002378 utf8Error:
2379 outpos = p-PyUnicode_AS_UNICODE(unicode);
2380 if (unicode_decode_call_errorhandler(
2381 errors, &errorHandler,
2382 "utf8", errmsg,
2383 &starts, &e, &startinpos, &endinpos, &exc, &s,
2384 &unicode, &outpos, &p))
2385 goto onError;
2386 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 }
Walter Dörwald69652032004-09-07 20:24:22 +00002388 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002392 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 goto onError;
2394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002395 Py_XDECREF(errorHandler);
2396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 return (PyObject *)unicode;
2398
Benjamin Peterson29060642009-01-31 22:14:21 +00002399 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 Py_XDECREF(errorHandler);
2401 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 Py_DECREF(unicode);
2403 return NULL;
2404}
2405
Antoine Pitrouab868312009-01-10 15:40:25 +00002406#undef ASCII_CHAR_MASK
2407
2408
Tim Peters602f7402002-04-27 18:03:26 +00002409/* Allocation strategy: if the string is short, convert into a stack buffer
2410 and allocate exactly as much space needed at the end. Else allocate the
2411 maximum possible needed (4 result bytes per Unicode character), and return
2412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002414PyObject *
2415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 Py_ssize_t size,
2417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418{
Tim Peters602f7402002-04-27 18:03:26 +00002419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002420
Guido van Rossum98297ee2007-11-06 21:34:58 +00002421 Py_ssize_t i; /* index into s of next input byte */
2422 PyObject *result; /* result string object */
2423 char *p; /* next free byte in output buffer */
2424 Py_ssize_t nallocated; /* number of result bytes allocated */
2425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002427 PyObject *errorHandler = NULL;
2428 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002429
Tim Peters602f7402002-04-27 18:03:26 +00002430 assert(s != NULL);
2431 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432
Tim Peters602f7402002-04-27 18:03:26 +00002433 if (size <= MAX_SHORT_UNICHARS) {
2434 /* Write into the stack buffer; nallocated can't overflow.
2435 * At the end, we'll allocate exactly as much heap space as it
2436 * turns out we need.
2437 */
2438 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002439 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002440 p = stackbuf;
2441 }
2442 else {
2443 /* Overallocate on the heap, and give the excess back at the end. */
2444 nallocated = size * 4;
2445 if (nallocated / 4 != size) /* overflow! */
2446 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002447 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002448 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002449 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002450 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002451 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002452
Tim Peters602f7402002-04-27 18:03:26 +00002453 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002454 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002455
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002456 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002457 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002459
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002461 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002462 *p++ = (char)(0xc0 | (ch >> 6));
2463 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002464 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002465 else {
Tim Peters602f7402002-04-27 18:03:26 +00002466 /* Encode UCS2 Unicode ordinals */
2467 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002468#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002469 /* Special case: check for high surrogate */
2470 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2471 Py_UCS4 ch2 = s[i];
2472 /* Check for low surrogate and combine the two to
2473 form a UCS4 value */
2474 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002475 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002476 i++;
2477 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002478 }
Tim Peters602f7402002-04-27 18:03:26 +00002479 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002480 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002481#endif
2482 if (ch >= 0xd800 && ch <= 0xdfff) {
2483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 char *prep;
2486 int k;
2487 rep = unicode_encode_call_errorhandler
2488 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2489 s, size, &exc, i-1, i, &newpos);
2490 if (!rep)
2491 goto error;
2492 /* Implementation limitations: only support error handler that return
2493 bytes, and only support up to four replacement bytes. */
2494 if (!PyBytes_Check(rep)) {
2495 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2496 Py_DECREF(rep);
2497 goto error;
2498 }
2499 if (PyBytes_Size(rep) > 4) {
2500 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2501 Py_DECREF(rep);
2502 goto error;
2503 }
2504 prep = PyBytes_AsString(rep);
2505 for(k = PyBytes_Size(rep); k > 0; k--)
2506 *p++ = *prep++;
2507 Py_DECREF(rep);
2508 continue;
2509
2510 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002511 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002512 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2513 *p++ = (char)(0x80 | (ch & 0x3f));
2514 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 }
Victor Stinnerd526c7c2010-03-23 11:43:20 +00002516#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002517 encodeUCS4:
Victor Stinnerd526c7c2010-03-23 11:43:20 +00002518#endif
Tim Peters602f7402002-04-27 18:03:26 +00002519 /* Encode UCS4 Unicode ordinals */
2520 *p++ = (char)(0xf0 | (ch >> 18));
2521 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2522 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2523 *p++ = (char)(0x80 | (ch & 0x3f));
2524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002526
Guido van Rossum98297ee2007-11-06 21:34:58 +00002527 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002528 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002529 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002530 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
2533 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002534 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002535 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002536 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002537 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002538 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002539 Py_XDECREF(errorHandler);
2540 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002541 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002542 error:
2543 Py_XDECREF(errorHandler);
2544 Py_XDECREF(exc);
2545 Py_XDECREF(result);
2546 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002547
Tim Peters602f7402002-04-27 18:03:26 +00002548#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549}
2550
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2552{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (!PyUnicode_Check(unicode)) {
2554 PyErr_BadArgument();
2555 return NULL;
2556 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002557 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002558 PyUnicode_GET_SIZE(unicode),
2559 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560}
2561
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562/* --- UTF-32 Codec ------------------------------------------------------- */
2563
2564PyObject *
2565PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002566 Py_ssize_t size,
2567 const char *errors,
2568 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002569{
2570 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2571}
2572
2573PyObject *
2574PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002575 Py_ssize_t size,
2576 const char *errors,
2577 int *byteorder,
2578 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002579{
2580 const char *starts = s;
2581 Py_ssize_t startinpos;
2582 Py_ssize_t endinpos;
2583 Py_ssize_t outpos;
2584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
2586#ifndef Py_UNICODE_WIDE
2587 int i, pairs;
2588#else
2589 const int pairs = 0;
2590#endif
2591 const unsigned char *q, *e;
2592 int bo = 0; /* assume native ordering by default */
2593 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002594 /* Offsets from q for retrieving bytes in the right order. */
2595#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2596 int iorder[] = {0, 1, 2, 3};
2597#else
2598 int iorder[] = {3, 2, 1, 0};
2599#endif
2600 PyObject *errorHandler = NULL;
2601 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002602 /* On narrow builds we split characters outside the BMP into two
2603 codepoints => count how much extra space we need. */
2604#ifndef Py_UNICODE_WIDE
2605 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 if (((Py_UCS4 *)s)[i] >= 0x10000)
2607 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002608#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002609
2610 /* This might be one to much, because of a BOM */
2611 unicode = _PyUnicode_New((size+3)/4+pairs);
2612 if (!unicode)
2613 return NULL;
2614 if (size == 0)
2615 return (PyObject *)unicode;
2616
2617 /* Unpack UTF-32 encoded data */
2618 p = unicode->str;
2619 q = (unsigned char *)s;
2620 e = q + size;
2621
2622 if (byteorder)
2623 bo = *byteorder;
2624
2625 /* Check for BOM marks (U+FEFF) in the input and adjust current
2626 byte order setting accordingly. In native mode, the leading BOM
2627 mark is skipped, in all other modes, it is copied to the output
2628 stream as-is (giving a ZWNBSP character). */
2629 if (bo == 0) {
2630 if (size >= 4) {
2631 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002633#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 if (bom == 0x0000FEFF) {
2635 q += 4;
2636 bo = -1;
2637 }
2638 else if (bom == 0xFFFE0000) {
2639 q += 4;
2640 bo = 1;
2641 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002642#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 if (bom == 0x0000FEFF) {
2644 q += 4;
2645 bo = 1;
2646 }
2647 else if (bom == 0xFFFE0000) {
2648 q += 4;
2649 bo = -1;
2650 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002651#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002653 }
2654
2655 if (bo == -1) {
2656 /* force LE */
2657 iorder[0] = 0;
2658 iorder[1] = 1;
2659 iorder[2] = 2;
2660 iorder[3] = 3;
2661 }
2662 else if (bo == 1) {
2663 /* force BE */
2664 iorder[0] = 3;
2665 iorder[1] = 2;
2666 iorder[2] = 1;
2667 iorder[3] = 0;
2668 }
2669
2670 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 Py_UCS4 ch;
2672 /* remaining bytes at the end? (size should be divisible by 4) */
2673 if (e-q<4) {
2674 if (consumed)
2675 break;
2676 errmsg = "truncated data";
2677 startinpos = ((const char *)q)-starts;
2678 endinpos = ((const char *)e)-starts;
2679 goto utf32Error;
2680 /* The remaining input chars are ignored if the callback
2681 chooses to skip the input */
2682 }
2683 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2684 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002685
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 if (ch >= 0x110000)
2687 {
2688 errmsg = "codepoint not in range(0x110000)";
2689 startinpos = ((const char *)q)-starts;
2690 endinpos = startinpos+4;
2691 goto utf32Error;
2692 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002693#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 if (ch >= 0x10000)
2695 {
2696 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2697 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2698 }
2699 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002700#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 *p++ = ch;
2702 q += 4;
2703 continue;
2704 utf32Error:
2705 outpos = p-PyUnicode_AS_UNICODE(unicode);
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "utf32", errmsg,
2709 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2710 &unicode, &outpos, &p))
2711 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002712 }
2713
2714 if (byteorder)
2715 *byteorder = bo;
2716
2717 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719
2720 /* Adjust length */
2721 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2722 goto onError;
2723
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return (PyObject *)unicode;
2727
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002729 Py_DECREF(unicode);
2730 Py_XDECREF(errorHandler);
2731 Py_XDECREF(exc);
2732 return NULL;
2733}
2734
2735PyObject *
2736PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 Py_ssize_t size,
2738 const char *errors,
2739 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002741 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002743 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002744#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002745 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002746#else
2747 const int pairs = 0;
2748#endif
2749 /* Offsets from p for storing byte pairs in the right order. */
2750#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2751 int iorder[] = {0, 1, 2, 3};
2752#else
2753 int iorder[] = {3, 2, 1, 0};
2754#endif
2755
Benjamin Peterson29060642009-01-31 22:14:21 +00002756#define STORECHAR(CH) \
2757 do { \
2758 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2759 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2760 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2761 p[iorder[0]] = (CH) & 0xff; \
2762 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763 } while(0)
2764
2765 /* In narrow builds we can output surrogate pairs as one codepoint,
2766 so we need less space. */
2767#ifndef Py_UNICODE_WIDE
2768 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2770 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2771 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002773 nsize = (size - pairs + (byteorder == 0));
2774 bytesize = nsize * 4;
2775 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002777 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778 if (v == NULL)
2779 return NULL;
2780
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002781 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002785 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002786
2787 if (byteorder == -1) {
2788 /* force LE */
2789 iorder[0] = 0;
2790 iorder[1] = 1;
2791 iorder[2] = 2;
2792 iorder[3] = 3;
2793 }
2794 else if (byteorder == 1) {
2795 /* force BE */
2796 iorder[0] = 3;
2797 iorder[1] = 2;
2798 iorder[2] = 1;
2799 iorder[3] = 0;
2800 }
2801
2802 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002804#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2806 Py_UCS4 ch2 = *s;
2807 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2808 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2809 s++;
2810 size--;
2811 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002813#endif
2814 STORECHAR(ch);
2815 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002816
2817 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002818 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819#undef STORECHAR
2820}
2821
2822PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 PyUnicode_GET_SIZE(unicode),
2830 NULL,
2831 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002832}
2833
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834/* --- UTF-16 Codec ------------------------------------------------------- */
2835
Tim Peters772747b2001-08-09 22:21:55 +00002836PyObject *
2837PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 Py_ssize_t size,
2839 const char *errors,
2840 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841{
Walter Dörwald69652032004-09-07 20:24:22 +00002842 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2843}
2844
Antoine Pitrouab868312009-01-10 15:40:25 +00002845/* Two masks for fast checking of whether a C 'long' may contain
2846 UTF16-encoded surrogate characters. This is an efficient heuristic,
2847 assuming that non-surrogate characters with a code point >= 0x8000 are
2848 rare in most input.
2849 FAST_CHAR_MASK is used when the input is in native byte ordering,
2850 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002851*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002852#if (SIZEOF_LONG == 8)
2853# define FAST_CHAR_MASK 0x8000800080008000L
2854# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2855#elif (SIZEOF_LONG == 4)
2856# define FAST_CHAR_MASK 0x80008000L
2857# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2858#else
2859# error C 'long' size should be either 4 or 8!
2860#endif
2861
Walter Dörwald69652032004-09-07 20:24:22 +00002862PyObject *
2863PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 Py_ssize_t size,
2865 const char *errors,
2866 int *byteorder,
2867 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002869 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002870 Py_ssize_t startinpos;
2871 Py_ssize_t endinpos;
2872 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 PyUnicodeObject *unicode;
2874 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002875 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002876 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002877 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002878 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002879 /* Offsets from q for retrieving byte pairs in the right order. */
2880#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2881 int ihi = 1, ilo = 0;
2882#else
2883 int ihi = 0, ilo = 1;
2884#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885 PyObject *errorHandler = NULL;
2886 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 /* Note: size will always be longer than the resulting Unicode
2889 character count */
2890 unicode = _PyUnicode_New(size);
2891 if (!unicode)
2892 return NULL;
2893 if (size == 0)
2894 return (PyObject *)unicode;
2895
2896 /* Unpack UTF-16 encoded data */
2897 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002898 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002899 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
2901 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002902 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002904 /* Check for BOM marks (U+FEFF) in the input and adjust current
2905 byte order setting accordingly. In native mode, the leading BOM
2906 mark is skipped, in all other modes, it is copied to the output
2907 stream as-is (giving a ZWNBSP character). */
2908 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002909 if (size >= 2) {
2910 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002911#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 if (bom == 0xFEFF) {
2913 q += 2;
2914 bo = -1;
2915 }
2916 else if (bom == 0xFFFE) {
2917 q += 2;
2918 bo = 1;
2919 }
Tim Petersced69f82003-09-16 20:30:58 +00002920#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 if (bom == 0xFEFF) {
2922 q += 2;
2923 bo = 1;
2924 }
2925 else if (bom == 0xFFFE) {
2926 q += 2;
2927 bo = -1;
2928 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002929#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Tim Peters772747b2001-08-09 22:21:55 +00002933 if (bo == -1) {
2934 /* force LE */
2935 ihi = 1;
2936 ilo = 0;
2937 }
2938 else if (bo == 1) {
2939 /* force BE */
2940 ihi = 0;
2941 ilo = 1;
2942 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002943#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2944 native_ordering = ilo < ihi;
2945#else
2946 native_ordering = ilo > ihi;
2947#endif
Tim Peters772747b2001-08-09 22:21:55 +00002948
Antoine Pitrouab868312009-01-10 15:40:25 +00002949 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002950 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002952 /* First check for possible aligned read of a C 'long'. Unaligned
2953 reads are more expensive, better to defer to another iteration. */
2954 if (!((size_t) q & LONG_PTR_MASK)) {
2955 /* Fast path for runs of non-surrogate chars. */
2956 register const unsigned char *_q = q;
2957 Py_UNICODE *_p = p;
2958 if (native_ordering) {
2959 /* Native ordering is simple: as long as the input cannot
2960 possibly contain a surrogate char, do an unrolled copy
2961 of several 16-bit code points to the target object.
2962 The non-surrogate check is done on several input bytes
2963 at a time (as many as a C 'long' can contain). */
2964 while (_q < aligned_end) {
2965 unsigned long data = * (unsigned long *) _q;
2966 if (data & FAST_CHAR_MASK)
2967 break;
2968 _p[0] = ((unsigned short *) _q)[0];
2969 _p[1] = ((unsigned short *) _q)[1];
2970#if (SIZEOF_LONG == 8)
2971 _p[2] = ((unsigned short *) _q)[2];
2972 _p[3] = ((unsigned short *) _q)[3];
2973#endif
2974 _q += SIZEOF_LONG;
2975 _p += SIZEOF_LONG / 2;
2976 }
2977 }
2978 else {
2979 /* Byteswapped ordering is similar, but we must decompose
2980 the copy bytewise, and take care of zero'ing out the
2981 upper bytes if the target object is in 32-bit units
2982 (that is, in UCS-4 builds). */
2983 while (_q < aligned_end) {
2984 unsigned long data = * (unsigned long *) _q;
2985 if (data & SWAPPED_FAST_CHAR_MASK)
2986 break;
2987 /* Zero upper bytes in UCS-4 builds */
2988#if (Py_UNICODE_SIZE > 2)
2989 _p[0] = 0;
2990 _p[1] = 0;
2991#if (SIZEOF_LONG == 8)
2992 _p[2] = 0;
2993 _p[3] = 0;
2994#endif
2995#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002996 /* Issue #4916; UCS-4 builds on big endian machines must
2997 fill the two last bytes of each 4-byte unit. */
2998#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2999# define OFF 2
3000#else
3001# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003002#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003003 ((unsigned char *) _p)[OFF + 1] = _q[0];
3004 ((unsigned char *) _p)[OFF + 0] = _q[1];
3005 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3006 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3007#if (SIZEOF_LONG == 8)
3008 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3009 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3010 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3011 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3012#endif
3013#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003014 _q += SIZEOF_LONG;
3015 _p += SIZEOF_LONG / 2;
3016 }
3017 }
3018 p = _p;
3019 q = _q;
3020 if (q >= e)
3021 break;
3022 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024
Benjamin Peterson14339b62009-01-31 16:36:08 +00003025 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003026
3027 if (ch < 0xD800 || ch > 0xDFFF) {
3028 *p++ = ch;
3029 continue;
3030 }
3031
3032 /* UTF-16 code pair: */
3033 if (q > e) {
3034 errmsg = "unexpected end of data";
3035 startinpos = (((const char *)q) - 2) - starts;
3036 endinpos = ((const char *)e) + 1 - starts;
3037 goto utf16Error;
3038 }
3039 if (0xD800 <= ch && ch <= 0xDBFF) {
3040 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3041 q += 2;
3042 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003043#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 *p++ = ch;
3045 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003048#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 continue;
3050 }
3051 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003052 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 startinpos = (((const char *)q)-4)-starts;
3054 endinpos = startinpos+2;
3055 goto utf16Error;
3056 }
3057
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 errmsg = "illegal encoding";
3060 startinpos = (((const char *)q)-2)-starts;
3061 endinpos = startinpos+2;
3062 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003063
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 utf16Error:
3065 outpos = p - PyUnicode_AS_UNICODE(unicode);
3066 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003067 errors,
3068 &errorHandler,
3069 "utf16", errmsg,
3070 &starts,
3071 (const char **)&e,
3072 &startinpos,
3073 &endinpos,
3074 &exc,
3075 (const char **)&q,
3076 &unicode,
3077 &outpos,
3078 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003081 /* remaining byte at the end? (size should be even) */
3082 if (e == q) {
3083 if (!consumed) {
3084 errmsg = "truncated data";
3085 startinpos = ((const char *)q) - starts;
3086 endinpos = ((const char *)e) + 1 - starts;
3087 outpos = p - PyUnicode_AS_UNICODE(unicode);
3088 if (unicode_decode_call_errorhandler(
3089 errors,
3090 &errorHandler,
3091 "utf16", errmsg,
3092 &starts,
3093 (const char **)&e,
3094 &startinpos,
3095 &endinpos,
3096 &exc,
3097 (const char **)&q,
3098 &unicode,
3099 &outpos,
3100 &p))
3101 goto onError;
3102 /* The remaining input chars are ignored if the callback
3103 chooses to skip the input */
3104 }
3105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106
3107 if (byteorder)
3108 *byteorder = bo;
3109
Walter Dörwald69652032004-09-07 20:24:22 +00003110 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003114 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 goto onError;
3116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 Py_XDECREF(errorHandler);
3118 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 return (PyObject *)unicode;
3120
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 Py_XDECREF(errorHandler);
3124 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 return NULL;
3126}
3127
Antoine Pitrouab868312009-01-10 15:40:25 +00003128#undef FAST_CHAR_MASK
3129#undef SWAPPED_FAST_CHAR_MASK
3130
Tim Peters772747b2001-08-09 22:21:55 +00003131PyObject *
3132PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 Py_ssize_t size,
3134 const char *errors,
3135 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003137 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003138 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003139 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003140#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003141 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003142#else
3143 const int pairs = 0;
3144#endif
Tim Peters772747b2001-08-09 22:21:55 +00003145 /* Offsets from p for storing byte pairs in the right order. */
3146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3147 int ihi = 1, ilo = 0;
3148#else
3149 int ihi = 0, ilo = 1;
3150#endif
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152#define STORECHAR(CH) \
3153 do { \
3154 p[ihi] = ((CH) >> 8) & 0xff; \
3155 p[ilo] = (CH) & 0xff; \
3156 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003157 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003159#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003160 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 if (s[i] >= 0x10000)
3162 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003163#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003164 /* 2 * (size + pairs + (byteorder == 0)) */
3165 if (size > PY_SSIZE_T_MAX ||
3166 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003168 nsize = size + pairs + (byteorder == 0);
3169 bytesize = nsize * 2;
3170 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003172 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (v == NULL)
3174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003176 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003179 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003180 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003181
3182 if (byteorder == -1) {
3183 /* force LE */
3184 ihi = 1;
3185 ilo = 0;
3186 }
3187 else if (byteorder == 1) {
3188 /* force BE */
3189 ihi = 0;
3190 ilo = 1;
3191 }
3192
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003193 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 Py_UNICODE ch = *s++;
3195 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003196#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 if (ch >= 0x10000) {
3198 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3199 ch = 0xD800 | ((ch-0x10000) >> 10);
3200 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003201#endif
Tim Peters772747b2001-08-09 22:21:55 +00003202 STORECHAR(ch);
3203 if (ch2)
3204 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003205 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003206
3207 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003208 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003209#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210}
3211
3212PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3213{
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 return NULL;
3217 }
3218 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 PyUnicode_GET_SIZE(unicode),
3220 NULL,
3221 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222}
3223
3224/* --- Unicode Escape Codec ----------------------------------------------- */
3225
/* C API table used to resolve \N{name} escapes.  Stays NULL until it is
   loaded on demand through PyCapsule_Import() the first time a \N escape
   is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003227
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 Py_ssize_t size,
3230 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t startinpos;
3234 Py_ssize_t endinpos;
3235 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003240 char* message;
3241 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 PyObject *errorHandler = NULL;
3243 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 /* Escaped strings will always be longer than the resulting
3246 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 length after conversion to the true value.
3248 (but if the error callback returns a long replacement string
3249 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 v = _PyUnicode_New(size);
3251 if (v == NULL)
3252 goto onError;
3253 if (size == 0)
3254 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003258
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 while (s < end) {
3260 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003261 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263
3264 /* Non-escape characters are interpreted as Unicode ordinals */
3265 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003266 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 continue;
3268 }
3269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 /* \ - Escapes */
3272 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003273 c = *s++;
3274 if (s > end)
3275 c = '\0'; /* Invalid after \ */
3276 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 case '\n': break;
3280 case '\\': *p++ = '\\'; break;
3281 case '\'': *p++ = '\''; break;
3282 case '\"': *p++ = '\"'; break;
3283 case 'b': *p++ = '\b'; break;
3284 case 'f': *p++ = '\014'; break; /* FF */
3285 case 't': *p++ = '\t'; break;
3286 case 'n': *p++ = '\n'; break;
3287 case 'r': *p++ = '\r'; break;
3288 case 'v': *p++ = '\013'; break; /* VT */
3289 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 case '0': case '1': case '2': case '3':
3293 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003294 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003295 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003296 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003297 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003298 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003300 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 break;
3302
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 /* hex escapes */
3304 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003306 digits = 2;
3307 message = "truncated \\xXX escape";
3308 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003312 digits = 4;
3313 message = "truncated \\uXXXX escape";
3314 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003317 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003318 digits = 8;
3319 message = "truncated \\UXXXXXXXX escape";
3320 hexescape:
3321 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 outpos = p-PyUnicode_AS_UNICODE(v);
3323 if (s+digits>end) {
3324 endinpos = size;
3325 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 errors, &errorHandler,
3327 "unicodeescape", "end of string in escape sequence",
3328 &starts, &end, &startinpos, &endinpos, &exc, &s,
3329 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 goto onError;
3331 goto nextByte;
3332 }
3333 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003334 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003335 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 endinpos = (s+i+1)-starts;
3337 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 errors, &errorHandler,
3339 "unicodeescape", message,
3340 &starts, &end, &startinpos, &endinpos, &exc, &s,
3341 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003344 }
3345 chr = (chr<<4) & ~0xF;
3346 if (c >= '0' && c <= '9')
3347 chr += c - '0';
3348 else if (c >= 'a' && c <= 'f')
3349 chr += 10 + c - 'a';
3350 else
3351 chr += 10 + c - 'A';
3352 }
3353 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003354 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 /* _decoding_error will have already written into the
3356 target buffer. */
3357 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003358 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003359 /* when we get here, chr is a 32-bit unicode character */
3360 if (chr <= 0xffff)
3361 /* UCS-2 character */
3362 *p++ = (Py_UNICODE) chr;
3363 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003364 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003365 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003366#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003367 *p++ = chr;
3368#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003369 chr -= 0x10000L;
3370 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003371 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003372#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003373 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 endinpos = s-starts;
3375 outpos = p-PyUnicode_AS_UNICODE(v);
3376 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 errors, &errorHandler,
3378 "unicodeescape", "illegal Unicode character",
3379 &starts, &end, &startinpos, &endinpos, &exc, &s,
3380 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003381 goto onError;
3382 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003383 break;
3384
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003386 case 'N':
3387 message = "malformed \\N character escape";
3388 if (ucnhash_CAPI == NULL) {
3389 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003390 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003391 if (ucnhash_CAPI == NULL)
3392 goto ucnhashError;
3393 }
3394 if (*s == '{') {
3395 const char *start = s+1;
3396 /* look for the closing brace */
3397 while (*s != '}' && s < end)
3398 s++;
3399 if (s > start && s < end && *s == '}') {
3400 /* found a name. look it up in the unicode database */
3401 message = "unknown Unicode character name";
3402 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003403 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003404 goto store;
3405 }
3406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 endinpos = s-starts;
3408 outpos = p-PyUnicode_AS_UNICODE(v);
3409 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 errors, &errorHandler,
3411 "unicodeescape", message,
3412 &starts, &end, &startinpos, &endinpos, &exc, &s,
3413 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003414 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003415 break;
3416
3417 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003418 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 message = "\\ at end of string";
3420 s--;
3421 endinpos = s-starts;
3422 outpos = p-PyUnicode_AS_UNICODE(v);
3423 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 errors, &errorHandler,
3425 "unicodeescape", message,
3426 &starts, &end, &startinpos, &endinpos, &exc, &s,
3427 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003428 goto onError;
3429 }
3430 else {
3431 *p++ = '\\';
3432 *p++ = (unsigned char)s[-1];
3433 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003434 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003439 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003441 Py_XDECREF(errorHandler);
3442 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003444
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003446 PyErr_SetString(
3447 PyExc_UnicodeError,
3448 "\\N escapes not supported (can't load unicodedata module)"
3449 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003450 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 Py_XDECREF(errorHandler);
3452 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003453 return NULL;
3454
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 Py_XDECREF(errorHandler);
3458 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 return NULL;
3460}
3461
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment used to say "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate", but the encoder in this
   file, PyUnicode_EncodeUnicodeEscape(), takes no `quotes` argument.

*/
3468
Thomas Wouters477c8d52006-05-27 19:21:47 +00003469Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 Py_ssize_t size,
3471 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003472{
3473 /* like wcschr, but doesn't stop at NULL characters */
3474
3475 while (size-- > 0) {
3476 if (*s == ch)
3477 return s;
3478 s++;
3479 }
3480
3481 return NULL;
3482}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003483
/* Lowercase hexadecimal digits, indexed by nibble value (0..15); used to
   build the \xhh, \uxxxx and \Uxxxxxxxx escape sequences. */
static const char *hexdigits = "0123456789abcdef";
3485
3486PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003489 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003492#ifdef Py_UNICODE_WIDE
3493 const Py_ssize_t expandsize = 10;
3494#else
3495 const Py_ssize_t expandsize = 6;
3496#endif
3497
Thomas Wouters89f507f2006-12-13 04:49:30 +00003498 /* XXX(nnorwitz): rather than over-allocating, it would be
3499 better to choose a different scheme. Perhaps scan the
3500 first N-chars of the string and allocate based on that size.
3501 */
3502 /* Initial allocation is based on the longest-possible unichr
3503 escape.
3504
3505 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3506 unichr, so in this case it's the longest unichr escape. In
3507 narrow (UTF-16) builds this is five chars per source unichr
3508 since there are two unichrs in the surrogate pair, so in narrow
3509 (UTF-16) builds it's not the longest unichr escape.
3510
3511 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3512 so in the narrow (UTF-16) build case it's the longest unichr
3513 escape.
3514 */
3515
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003516 if (size == 0)
3517 return PyBytes_FromStringAndSize(NULL, 0);
3518
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003519 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003521
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003522 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 2
3524 + expandsize*size
3525 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 if (repr == NULL)
3527 return NULL;
3528
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003529 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 while (size-- > 0) {
3532 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003533
Walter Dörwald79e913e2007-05-12 11:08:06 +00003534 /* Escape backslashes */
3535 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 *p++ = '\\';
3537 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003538 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003539 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003540
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003541#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003542 /* Map 21-bit characters to '\U00xxxxxx' */
3543 else if (ch >= 0x10000) {
3544 *p++ = '\\';
3545 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003546 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3547 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3548 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3549 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3550 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3551 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3552 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3553 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003555 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003556#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3558 else if (ch >= 0xD800 && ch < 0xDC00) {
3559 Py_UNICODE ch2;
3560 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003561
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 ch2 = *s++;
3563 size--;
3564 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3565 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3566 *p++ = '\\';
3567 *p++ = 'U';
3568 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3569 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3570 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3571 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3572 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3573 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3574 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3575 *p++ = hexdigits[ucs & 0x0000000F];
3576 continue;
3577 }
3578 /* Fall through: isolated surrogates are copied as-is */
3579 s--;
3580 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003581 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003582#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003583
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003585 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 *p++ = '\\';
3587 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003588 *p++ = hexdigits[(ch >> 12) & 0x000F];
3589 *p++ = hexdigits[(ch >> 8) & 0x000F];
3590 *p++ = hexdigits[(ch >> 4) & 0x000F];
3591 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003593
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003594 /* Map special whitespace to '\t', \n', '\r' */
3595 else if (ch == '\t') {
3596 *p++ = '\\';
3597 *p++ = 't';
3598 }
3599 else if (ch == '\n') {
3600 *p++ = '\\';
3601 *p++ = 'n';
3602 }
3603 else if (ch == '\r') {
3604 *p++ = '\\';
3605 *p++ = 'r';
3606 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003607
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003608 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003609 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003611 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003612 *p++ = hexdigits[(ch >> 4) & 0x000F];
3613 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003614 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 /* Copy everything else as-is */
3617 else
3618 *p++ = (char) ch;
3619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003621 assert(p - PyBytes_AS_STRING(repr) > 0);
3622 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3623 return NULL;
3624 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625}
3626
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003627PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003629 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 if (!PyUnicode_Check(unicode)) {
3631 PyErr_BadArgument();
3632 return NULL;
3633 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003634 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3635 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003636 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637}
3638
3639/* --- Raw Unicode Escape Codec ------------------------------------------- */
3640
3641PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 Py_ssize_t size,
3643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003646 Py_ssize_t startinpos;
3647 Py_ssize_t endinpos;
3648 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 const char *end;
3652 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 PyObject *errorHandler = NULL;
3654 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 /* Escaped strings will always be longer than the resulting
3657 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 length after conversion to the true value. (But decoding error
3659 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 v = _PyUnicode_New(size);
3661 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 end = s + size;
3667 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 unsigned char c;
3669 Py_UCS4 x;
3670 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003671 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 /* Non-escape characters are interpreted as Unicode ordinals */
3674 if (*s != '\\') {
3675 *p++ = (unsigned char)*s++;
3676 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 startinpos = s-starts;
3679
3680 /* \u-escapes are only interpreted iff the number of leading
3681 backslashes if odd */
3682 bs = s;
3683 for (;s < end;) {
3684 if (*s != '\\')
3685 break;
3686 *p++ = (unsigned char)*s++;
3687 }
3688 if (((s - bs) & 1) == 0 ||
3689 s >= end ||
3690 (*s != 'u' && *s != 'U')) {
3691 continue;
3692 }
3693 p--;
3694 count = *s=='u' ? 4 : 8;
3695 s++;
3696
3697 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3698 outpos = p-PyUnicode_AS_UNICODE(v);
3699 for (x = 0, i = 0; i < count; ++i, ++s) {
3700 c = (unsigned char)*s;
3701 if (!ISXDIGIT(c)) {
3702 endinpos = s-starts;
3703 if (unicode_decode_call_errorhandler(
3704 errors, &errorHandler,
3705 "rawunicodeescape", "truncated \\uXXXX",
3706 &starts, &end, &startinpos, &endinpos, &exc, &s,
3707 &v, &outpos, &p))
3708 goto onError;
3709 goto nextByte;
3710 }
3711 x = (x<<4) & ~0xF;
3712 if (c >= '0' && c <= '9')
3713 x += c - '0';
3714 else if (c >= 'a' && c <= 'f')
3715 x += 10 + c - 'a';
3716 else
3717 x += 10 + c - 'A';
3718 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003719 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 /* UCS-2 character */
3721 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003722 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 /* UCS-4 character. Either store directly, or as
3724 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003725#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003727#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 x -= 0x10000L;
3729 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3730 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003731#endif
3732 } else {
3733 endinpos = s-starts;
3734 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003735 if (unicode_decode_call_errorhandler(
3736 errors, &errorHandler,
3737 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003738 &starts, &end, &startinpos, &endinpos, &exc, &s,
3739 &v, &outpos, &p))
3740 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003742 nextByte:
3743 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003745 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_XDECREF(errorHandler);
3748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 Py_XDECREF(errorHandler);
3754 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 return NULL;
3756}
3757
3758PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003761 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 char *p;
3763 char *q;
3764
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003765#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003766 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003767#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003768 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003769#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003770
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003771 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003773
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003774 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 if (repr == NULL)
3776 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003777 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003778 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003780 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 while (size-- > 0) {
3782 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003783#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 /* Map 32-bit characters to '\Uxxxxxxxx' */
3785 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003786 *p++ = '\\';
3787 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003788 *p++ = hexdigits[(ch >> 28) & 0xf];
3789 *p++ = hexdigits[(ch >> 24) & 0xf];
3790 *p++ = hexdigits[(ch >> 20) & 0xf];
3791 *p++ = hexdigits[(ch >> 16) & 0xf];
3792 *p++ = hexdigits[(ch >> 12) & 0xf];
3793 *p++ = hexdigits[(ch >> 8) & 0xf];
3794 *p++ = hexdigits[(ch >> 4) & 0xf];
3795 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003796 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003797 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003798#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3800 if (ch >= 0xD800 && ch < 0xDC00) {
3801 Py_UNICODE ch2;
3802 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003803
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 ch2 = *s++;
3805 size--;
3806 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3807 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3808 *p++ = '\\';
3809 *p++ = 'U';
3810 *p++ = hexdigits[(ucs >> 28) & 0xf];
3811 *p++ = hexdigits[(ucs >> 24) & 0xf];
3812 *p++ = hexdigits[(ucs >> 20) & 0xf];
3813 *p++ = hexdigits[(ucs >> 16) & 0xf];
3814 *p++ = hexdigits[(ucs >> 12) & 0xf];
3815 *p++ = hexdigits[(ucs >> 8) & 0xf];
3816 *p++ = hexdigits[(ucs >> 4) & 0xf];
3817 *p++ = hexdigits[ucs & 0xf];
3818 continue;
3819 }
3820 /* Fall through: isolated surrogates are copied as-is */
3821 s--;
3822 size++;
3823 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003824#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003825 /* Map 16-bit characters to '\uxxxx' */
3826 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 *p++ = '\\';
3828 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003829 *p++ = hexdigits[(ch >> 12) & 0xf];
3830 *p++ = hexdigits[(ch >> 8) & 0xf];
3831 *p++ = hexdigits[(ch >> 4) & 0xf];
3832 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 /* Copy everything else as-is */
3835 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 *p++ = (char) ch;
3837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003838 size = p - q;
3839
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003840 assert(size > 0);
3841 if (_PyBytes_Resize(&repr, size) < 0)
3842 return NULL;
3843 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844}
3845
3846PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3847{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003848 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003850 PyErr_BadArgument();
3851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003853 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3854 PyUnicode_GET_SIZE(unicode));
3855
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003856 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857}
3858
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003859/* --- Unicode Internal Codec ------------------------------------------- */
3860
3861PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 Py_ssize_t size,
3863 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003864{
3865 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003866 Py_ssize_t startinpos;
3867 Py_ssize_t endinpos;
3868 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003869 PyUnicodeObject *v;
3870 Py_UNICODE *p;
3871 const char *end;
3872 const char *reason;
3873 PyObject *errorHandler = NULL;
3874 PyObject *exc = NULL;
3875
Neal Norwitzd43069c2006-01-08 01:12:10 +00003876#ifdef Py_UNICODE_WIDE
3877 Py_UNICODE unimax = PyUnicode_GetMax();
3878#endif
3879
Thomas Wouters89f507f2006-12-13 04:49:30 +00003880 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003881 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3882 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003884 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003885 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003886 p = PyUnicode_AS_UNICODE(v);
3887 end = s + size;
3888
3889 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003890 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003891 /* We have to sanity check the raw data, otherwise doom looms for
3892 some malformed UCS-4 data. */
3893 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003894#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003895 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003896#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003897 end-s < Py_UNICODE_SIZE
3898 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003900 startinpos = s - starts;
3901 if (end-s < Py_UNICODE_SIZE) {
3902 endinpos = end-starts;
3903 reason = "truncated input";
3904 }
3905 else {
3906 endinpos = s - starts + Py_UNICODE_SIZE;
3907 reason = "illegal code point (> 0x10FFFF)";
3908 }
3909 outpos = p - PyUnicode_AS_UNICODE(v);
3910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003914 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003915 goto onError;
3916 }
3917 }
3918 else {
3919 p++;
3920 s += Py_UNICODE_SIZE;
3921 }
3922 }
3923
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003924 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003925 goto onError;
3926 Py_XDECREF(errorHandler);
3927 Py_XDECREF(exc);
3928 return (PyObject *)v;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003931 Py_XDECREF(v);
3932 Py_XDECREF(errorHandler);
3933 Py_XDECREF(exc);
3934 return NULL;
3935}
3936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937/* --- Latin-1 Codec ------------------------------------------------------ */
3938
3939PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 Py_ssize_t size,
3941 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
3943 PyUnicodeObject *v;
3944 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003945 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003946
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003948 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 Py_UNICODE r = *(unsigned char*)s;
3950 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003951 }
3952
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 v = _PyUnicode_New(size);
3954 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003959 e = s + size;
3960 /* Unrolling the copy makes it much faster by reducing the looping
3961 overhead. This is similar to what many memcpy() implementations do. */
3962 unrolled_end = e - 4;
3963 while (s < unrolled_end) {
3964 p[0] = (unsigned char) s[0];
3965 p[1] = (unsigned char) s[1];
3966 p[2] = (unsigned char) s[2];
3967 p[3] = (unsigned char) s[3];
3968 s += 4;
3969 p += 4;
3970 }
3971 while (s < e)
3972 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003974
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 Py_XDECREF(v);
3977 return NULL;
3978}
3979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980/* create or adjust a UnicodeEncodeError */
3981static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 const char *encoding,
3983 const Py_UNICODE *unicode, Py_ssize_t size,
3984 Py_ssize_t startpos, Py_ssize_t endpos,
3985 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 *exceptionObject = PyUnicodeEncodeError_Create(
3989 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 }
3991 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3993 goto onError;
3994 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3995 goto onError;
3996 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3997 goto onError;
3998 return;
3999 onError:
4000 Py_DECREF(*exceptionObject);
4001 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 }
4003}
4004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005/* raises a UnicodeEncodeError */
4006static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 const char *encoding,
4008 const Py_UNICODE *unicode, Py_ssize_t size,
4009 Py_ssize_t startpos, Py_ssize_t endpos,
4010 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011{
4012 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016}
4017
4018/* error handling callback helper:
4019 build arguments, call the callback and check the arguments,
4020 put the result into newpos and return the replacement string, which
4021 has to be freed by the caller */
4022static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 PyObject **errorHandler,
4024 const char *encoding, const char *reason,
4025 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4026 Py_ssize_t startpos, Py_ssize_t endpos,
4027 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004029 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030
4031 PyObject *restuple;
4032 PyObject *resunicode;
4033
4034 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 }
4039
4040 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044
4045 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004050 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 Py_DECREF(restuple);
4052 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004054 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 &resunicode, newpos)) {
4056 Py_DECREF(restuple);
4057 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004059 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4060 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4061 Py_DECREF(restuple);
4062 return NULL;
4063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004066 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4068 Py_DECREF(restuple);
4069 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 Py_INCREF(resunicode);
4072 Py_DECREF(restuple);
4073 return resunicode;
4074}
4075
4076static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 Py_ssize_t size,
4078 const char *errors,
4079 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080{
4081 /* output object */
4082 PyObject *res;
4083 /* pointers to the beginning and end+1 of input */
4084 const Py_UNICODE *startp = p;
4085 const Py_UNICODE *endp = p + size;
4086 /* pointer to the beginning of the unencodable characters */
4087 /* const Py_UNICODE *badp = NULL; */
4088 /* pointer into the output */
4089 char *str;
4090 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004091 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004092 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4093 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 PyObject *errorHandler = NULL;
4095 PyObject *exc = NULL;
4096 /* the following variable is used for caching string comparisons
4097 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4098 int known_errorHandler = -1;
4099
4100 /* allocate enough for a simple encoding without
4101 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004102 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004104 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004106 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004107 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 ressize = size;
4109
4110 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 /* can we encode this? */
4114 if (c<limit) {
4115 /* no overflow check, because we know that the space is enough */
4116 *str++ = (char)c;
4117 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004118 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 else {
4120 Py_ssize_t unicodepos = p-startp;
4121 Py_ssize_t requiredsize;
4122 PyObject *repunicode;
4123 Py_ssize_t repsize;
4124 Py_ssize_t newpos;
4125 Py_ssize_t respos;
4126 Py_UNICODE *uni2;
4127 /* startpos for collecting unencodable chars */
4128 const Py_UNICODE *collstart = p;
4129 const Py_UNICODE *collend = p;
4130 /* find all unecodable characters */
4131 while ((collend < endp) && ((*collend)>=limit))
4132 ++collend;
4133 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4134 if (known_errorHandler==-1) {
4135 if ((errors==NULL) || (!strcmp(errors, "strict")))
4136 known_errorHandler = 1;
4137 else if (!strcmp(errors, "replace"))
4138 known_errorHandler = 2;
4139 else if (!strcmp(errors, "ignore"))
4140 known_errorHandler = 3;
4141 else if (!strcmp(errors, "xmlcharrefreplace"))
4142 known_errorHandler = 4;
4143 else
4144 known_errorHandler = 0;
4145 }
4146 switch (known_errorHandler) {
4147 case 1: /* strict */
4148 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4149 goto onError;
4150 case 2: /* replace */
4151 while (collstart++<collend)
4152 *str++ = '?'; /* fall through */
4153 case 3: /* ignore */
4154 p = collend;
4155 break;
4156 case 4: /* xmlcharrefreplace */
4157 respos = str - PyBytes_AS_STRING(res);
4158 /* determine replacement size (temporarily (mis)uses p) */
4159 for (p = collstart, repsize = 0; p < collend; ++p) {
4160 if (*p<10)
4161 repsize += 2+1+1;
4162 else if (*p<100)
4163 repsize += 2+2+1;
4164 else if (*p<1000)
4165 repsize += 2+3+1;
4166 else if (*p<10000)
4167 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004168#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 else
4170 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004171#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 else if (*p<100000)
4173 repsize += 2+5+1;
4174 else if (*p<1000000)
4175 repsize += 2+6+1;
4176 else
4177 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004178#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 }
4180 requiredsize = respos+repsize+(endp-collend);
4181 if (requiredsize > ressize) {
4182 if (requiredsize<2*ressize)
4183 requiredsize = 2*ressize;
4184 if (_PyBytes_Resize(&res, requiredsize))
4185 goto onError;
4186 str = PyBytes_AS_STRING(res) + respos;
4187 ressize = requiredsize;
4188 }
4189 /* generate replacement (temporarily (mis)uses p) */
4190 for (p = collstart; p < collend; ++p) {
4191 str += sprintf(str, "&#%d;", (int)*p);
4192 }
4193 p = collend;
4194 break;
4195 default:
4196 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4197 encoding, reason, startp, size, &exc,
4198 collstart-startp, collend-startp, &newpos);
4199 if (repunicode == NULL)
4200 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004201 if (PyBytes_Check(repunicode)) {
4202 /* Directly copy bytes result to output. */
4203 repsize = PyBytes_Size(repunicode);
4204 if (repsize > 1) {
4205 /* Make room for all additional bytes. */
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004206 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004207 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4208 Py_DECREF(repunicode);
4209 goto onError;
4210 }
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004211 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004212 ressize += repsize-1;
4213 }
4214 memcpy(str, PyBytes_AsString(repunicode), repsize);
4215 str += repsize;
4216 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004217 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004218 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 /* need more space? (at least enough for what we
4221 have+the replacement+the rest of the string, so
4222 we won't have to check space for encodable characters) */
4223 respos = str - PyBytes_AS_STRING(res);
4224 repsize = PyUnicode_GET_SIZE(repunicode);
4225 requiredsize = respos+repsize+(endp-collend);
4226 if (requiredsize > ressize) {
4227 if (requiredsize<2*ressize)
4228 requiredsize = 2*ressize;
4229 if (_PyBytes_Resize(&res, requiredsize)) {
4230 Py_DECREF(repunicode);
4231 goto onError;
4232 }
4233 str = PyBytes_AS_STRING(res) + respos;
4234 ressize = requiredsize;
4235 }
4236 /* check if there is anything unencodable in the replacement
4237 and copy it to the output */
4238 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4239 c = *uni2;
4240 if (c >= limit) {
4241 raise_encode_exception(&exc, encoding, startp, size,
4242 unicodepos, unicodepos+1, reason);
4243 Py_DECREF(repunicode);
4244 goto onError;
4245 }
4246 *str = (char)c;
4247 }
4248 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004249 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004251 }
4252 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004253 /* Resize if we allocated to much */
4254 size = str - PyBytes_AS_STRING(res);
4255 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004256 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004257 if (_PyBytes_Resize(&res, size) < 0)
4258 goto onError;
4259 }
4260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 Py_XDECREF(errorHandler);
4262 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004263 return res;
4264
4265 onError:
4266 Py_XDECREF(res);
4267 Py_XDECREF(errorHandler);
4268 Py_XDECREF(exc);
4269 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270}
4271
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 Py_ssize_t size,
4274 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277}
4278
4279PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4280{
4281 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 PyErr_BadArgument();
4283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 }
4285 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 PyUnicode_GET_SIZE(unicode),
4287 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288}
4289
4290/* --- 7-bit ASCII Codec -------------------------------------------------- */
4291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 Py_ssize_t size,
4294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 PyUnicodeObject *v;
4298 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004299 Py_ssize_t startinpos;
4300 Py_ssize_t endinpos;
4301 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 const char *e;
4303 PyObject *errorHandler = NULL;
4304 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004307 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 Py_UNICODE r = *(unsigned char*)s;
4309 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004310 }
Tim Petersced69f82003-09-16 20:30:58 +00004311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 v = _PyUnicode_New(size);
4313 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 e = s + size;
4319 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 register unsigned char c = (unsigned char)*s;
4321 if (c < 128) {
4322 *p++ = c;
4323 ++s;
4324 }
4325 else {
4326 startinpos = s-starts;
4327 endinpos = startinpos + 1;
4328 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4329 if (unicode_decode_call_errorhandler(
4330 errors, &errorHandler,
4331 "ascii", "ordinal not in range(128)",
4332 &starts, &e, &startinpos, &endinpos, &exc, &s,
4333 &v, &outpos, &p))
4334 goto onError;
4335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004337 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4339 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 Py_XDECREF(errorHandler);
4341 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 Py_XDECREF(errorHandler);
4347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 return NULL;
4349}
4350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 Py_ssize_t size,
4353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356}
4357
4358PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4359{
4360 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 PyErr_BadArgument();
4362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 }
4364 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 PyUnicode_GET_SIZE(unicode),
4366 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367}
4368
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004369#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004370
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004371/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004372
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004373#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004374#define NEED_RETRY
4375#endif
4376
4377/* XXX This code is limited to "true" double-byte encodings, as
4378 a) it assumes an incomplete character consists of a single byte, and
4379 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004381
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is exactly
   two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4392
4393/*
4394 * Decode MBCS string into unicode object. If 'final' is set, converts
4395 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4396 */
4397static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 const char *s, /* MBCS string */
4399 int size, /* sizeof MBCS string */
4400 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004401{
4402 Py_UNICODE *p;
4403 Py_ssize_t n = 0;
4404 int usize = 0;
4405
4406 assert(size >= 0);
4407
4408 /* Skip trailing lead-byte unless 'final' is set */
4409 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004411
4412 /* First get the size of the result */
4413 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4415 if (usize == 0) {
4416 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4417 return -1;
4418 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004419 }
4420
4421 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 /* Create unicode object */
4423 *v = _PyUnicode_New(usize);
4424 if (*v == NULL)
4425 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004426 }
4427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 /* Extend unicode object */
4429 n = PyUnicode_GET_SIZE(*v);
4430 if (_PyUnicode_Resize(v, n + usize) < 0)
4431 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004432 }
4433
4434 /* Do the conversion */
4435 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 p = PyUnicode_AS_UNICODE(*v) + n;
4437 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4438 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4439 return -1;
4440 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004441 }
4442
4443 return size;
4444}
4445
4446PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 Py_ssize_t size,
4448 const char *errors,
4449 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004450{
4451 PyUnicodeObject *v = NULL;
4452 int done;
4453
4454 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004456
4457#ifdef NEED_RETRY
4458 retry:
4459 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004461 else
4462#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004464
4465 if (done < 0) {
4466 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004468 }
4469
4470 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004472
4473#ifdef NEED_RETRY
4474 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 s += done;
4476 size -= done;
4477 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004478 }
4479#endif
4480
4481 return (PyObject *)v;
4482}
4483
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004484PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 Py_ssize_t size,
4486 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004487{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004488 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4489}
4490
4491/*
4492 * Convert unicode into string object (MBCS).
4493 * Returns 0 if succeed, -1 otherwise.
4494 */
4495static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 const Py_UNICODE *p, /* unicode */
4497 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004498{
4499 int mbcssize = 0;
4500 Py_ssize_t n = 0;
4501
4502 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004503
4504 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004505 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4507 if (mbcssize == 0) {
4508 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4509 return -1;
4510 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004511 }
4512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004513 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 /* Create string object */
4515 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4516 if (*repr == NULL)
4517 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004518 }
4519 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 /* Extend string object */
4521 n = PyBytes_Size(*repr);
4522 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4523 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004524 }
4525
4526 /* Do the conversion */
4527 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 char *s = PyBytes_AS_STRING(*repr) + n;
4529 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4530 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4531 return -1;
4532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004533 }
4534
4535 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004536}
4537
4538PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 Py_ssize_t size,
4540 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004541{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004542 PyObject *repr = NULL;
4543 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004544
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549 else
4550#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004552
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004553 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 Py_XDECREF(repr);
4555 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004556 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557
4558#ifdef NEED_RETRY
4559 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 p += INT_MAX;
4561 size -= INT_MAX;
4562 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004563 }
4564#endif
4565
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004566 return repr;
4567}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004568
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004569PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4570{
4571 if (!PyUnicode_Check(unicode)) {
4572 PyErr_BadArgument();
4573 return NULL;
4574 }
4575 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 PyUnicode_GET_SIZE(unicode),
4577 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004578}
4579
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580#undef NEED_RETRY
4581
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004582#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584/* --- Character Mapping Codec -------------------------------------------- */
4585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_ssize_t size,
4588 PyObject *mapping,
4589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t startinpos;
4593 Py_ssize_t endinpos;
4594 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 PyUnicodeObject *v;
4597 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004598 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 PyObject *errorHandler = NULL;
4600 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004601 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 /* Default to Latin-1 */
4605 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 v = _PyUnicode_New(size);
4609 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004615 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 mapstring = PyUnicode_AS_UNICODE(mapping);
4617 maplen = PyUnicode_GET_SIZE(mapping);
4618 while (s < e) {
4619 unsigned char ch = *s;
4620 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 if (ch < maplen)
4623 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 if (x == 0xfffe) {
4626 /* undefined mapping */
4627 outpos = p-PyUnicode_AS_UNICODE(v);
4628 startinpos = s-starts;
4629 endinpos = startinpos+1;
4630 if (unicode_decode_call_errorhandler(
4631 errors, &errorHandler,
4632 "charmap", "character maps to <undefined>",
4633 &starts, &e, &startinpos, &endinpos, &exc, &s,
4634 &v, &outpos, &p)) {
4635 goto onError;
4636 }
4637 continue;
4638 }
4639 *p++ = x;
4640 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004641 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004642 }
4643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 while (s < e) {
4645 unsigned char ch = *s;
4646 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004647
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4649 w = PyLong_FromLong((long)ch);
4650 if (w == NULL)
4651 goto onError;
4652 x = PyObject_GetItem(mapping, w);
4653 Py_DECREF(w);
4654 if (x == NULL) {
4655 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4656 /* No mapping found means: mapping is undefined. */
4657 PyErr_Clear();
4658 x = Py_None;
4659 Py_INCREF(x);
4660 } else
4661 goto onError;
4662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004663
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 /* Apply mapping */
4665 if (PyLong_Check(x)) {
4666 long value = PyLong_AS_LONG(x);
4667 if (value < 0 || value > 65535) {
4668 PyErr_SetString(PyExc_TypeError,
4669 "character mapping must be in range(65536)");
4670 Py_DECREF(x);
4671 goto onError;
4672 }
4673 *p++ = (Py_UNICODE)value;
4674 }
4675 else if (x == Py_None) {
4676 /* undefined mapping */
4677 outpos = p-PyUnicode_AS_UNICODE(v);
4678 startinpos = s-starts;
4679 endinpos = startinpos+1;
4680 if (unicode_decode_call_errorhandler(
4681 errors, &errorHandler,
4682 "charmap", "character maps to <undefined>",
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
4684 &v, &outpos, &p)) {
4685 Py_DECREF(x);
4686 goto onError;
4687 }
4688 Py_DECREF(x);
4689 continue;
4690 }
4691 else if (PyUnicode_Check(x)) {
4692 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004693
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 if (targetsize == 1)
4695 /* 1-1 mapping */
4696 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004697
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 else if (targetsize > 1) {
4699 /* 1-n mapping */
4700 if (targetsize > extrachars) {
4701 /* resize first */
4702 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4703 Py_ssize_t needed = (targetsize - extrachars) + \
4704 (targetsize << 2);
4705 extrachars += needed;
4706 /* XXX overflow detection missing */
4707 if (_PyUnicode_Resize(&v,
4708 PyUnicode_GET_SIZE(v) + needed) < 0) {
4709 Py_DECREF(x);
4710 goto onError;
4711 }
4712 p = PyUnicode_AS_UNICODE(v) + oldpos;
4713 }
4714 Py_UNICODE_COPY(p,
4715 PyUnicode_AS_UNICODE(x),
4716 targetsize);
4717 p += targetsize;
4718 extrachars -= targetsize;
4719 }
4720 /* 1-0 mapping: skip the character */
4721 }
4722 else {
4723 /* wrong return value */
4724 PyErr_SetString(PyExc_TypeError,
4725 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004726 Py_DECREF(x);
4727 goto onError;
4728 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 Py_DECREF(x);
4730 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 }
4733 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(errorHandler);
4737 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004739
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_XDECREF(v);
4744 return NULL;
4745}
4746
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004747/* Charmap encoding: the lookup table */
4748
4749struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 PyObject_HEAD
4751 unsigned char level1[32];
4752 int count2, count3;
4753 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004754};
4755
4756static PyObject*
4757encoding_map_size(PyObject *obj, PyObject* args)
4758{
4759 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004760 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004762}
4763
4764static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 PyDoc_STR("Return the size (in bytes) of this object") },
4767 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004768};
4769
4770static void
4771encoding_map_dealloc(PyObject* o)
4772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004773 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004774}
4775
4776static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 "EncodingMap", /*tp_name*/
4779 sizeof(struct encoding_map), /*tp_basicsize*/
4780 0, /*tp_itemsize*/
4781 /* methods */
4782 encoding_map_dealloc, /*tp_dealloc*/
4783 0, /*tp_print*/
4784 0, /*tp_getattr*/
4785 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004786 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 0, /*tp_repr*/
4788 0, /*tp_as_number*/
4789 0, /*tp_as_sequence*/
4790 0, /*tp_as_mapping*/
4791 0, /*tp_hash*/
4792 0, /*tp_call*/
4793 0, /*tp_str*/
4794 0, /*tp_getattro*/
4795 0, /*tp_setattro*/
4796 0, /*tp_as_buffer*/
4797 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4798 0, /*tp_doc*/
4799 0, /*tp_traverse*/
4800 0, /*tp_clear*/
4801 0, /*tp_richcompare*/
4802 0, /*tp_weaklistoffset*/
4803 0, /*tp_iter*/
4804 0, /*tp_iternext*/
4805 encoding_map_methods, /*tp_methods*/
4806 0, /*tp_members*/
4807 0, /*tp_getset*/
4808 0, /*tp_base*/
4809 0, /*tp_dict*/
4810 0, /*tp_descr_get*/
4811 0, /*tp_descr_set*/
4812 0, /*tp_dictoffset*/
4813 0, /*tp_init*/
4814 0, /*tp_alloc*/
4815 0, /*tp_new*/
4816 0, /*tp_free*/
4817 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004818};
4819
4820PyObject*
4821PyUnicode_BuildEncodingMap(PyObject* string)
4822{
4823 Py_UNICODE *decode;
4824 PyObject *result;
4825 struct encoding_map *mresult;
4826 int i;
4827 int need_dict = 0;
4828 unsigned char level1[32];
4829 unsigned char level2[512];
4830 unsigned char *mlevel1, *mlevel2, *mlevel3;
4831 int count2 = 0, count3 = 0;
4832
4833 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4834 PyErr_BadArgument();
4835 return NULL;
4836 }
4837 decode = PyUnicode_AS_UNICODE(string);
4838 memset(level1, 0xFF, sizeof level1);
4839 memset(level2, 0xFF, sizeof level2);
4840
4841 /* If there isn't a one-to-one mapping of NULL to \0,
4842 or if there are non-BMP characters, we need to use
4843 a mapping dictionary. */
4844 if (decode[0] != 0)
4845 need_dict = 1;
4846 for (i = 1; i < 256; i++) {
4847 int l1, l2;
4848 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004849#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004850 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004851#endif
4852 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004853 need_dict = 1;
4854 break;
4855 }
4856 if (decode[i] == 0xFFFE)
4857 /* unmapped character */
4858 continue;
4859 l1 = decode[i] >> 11;
4860 l2 = decode[i] >> 7;
4861 if (level1[l1] == 0xFF)
4862 level1[l1] = count2++;
4863 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004864 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004865 }
4866
4867 if (count2 >= 0xFF || count3 >= 0xFF)
4868 need_dict = 1;
4869
4870 if (need_dict) {
4871 PyObject *result = PyDict_New();
4872 PyObject *key, *value;
4873 if (!result)
4874 return NULL;
4875 for (i = 0; i < 256; i++) {
4876 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004877 key = PyLong_FromLong(decode[i]);
4878 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004879 if (!key || !value)
4880 goto failed1;
4881 if (PyDict_SetItem(result, key, value) == -1)
4882 goto failed1;
4883 Py_DECREF(key);
4884 Py_DECREF(value);
4885 }
4886 return result;
4887 failed1:
4888 Py_XDECREF(key);
4889 Py_XDECREF(value);
4890 Py_DECREF(result);
4891 return NULL;
4892 }
4893
4894 /* Create a three-level trie */
4895 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4896 16*count2 + 128*count3 - 1);
4897 if (!result)
4898 return PyErr_NoMemory();
4899 PyObject_Init(result, &EncodingMapType);
4900 mresult = (struct encoding_map*)result;
4901 mresult->count2 = count2;
4902 mresult->count3 = count3;
4903 mlevel1 = mresult->level1;
4904 mlevel2 = mresult->level23;
4905 mlevel3 = mresult->level23 + 16*count2;
4906 memcpy(mlevel1, level1, 32);
4907 memset(mlevel2, 0xFF, 16*count2);
4908 memset(mlevel3, 0, 128*count3);
4909 count3 = 0;
4910 for (i = 1; i < 256; i++) {
4911 int o1, o2, o3, i2, i3;
4912 if (decode[i] == 0xFFFE)
4913 /* unmapped character */
4914 continue;
4915 o1 = decode[i]>>11;
4916 o2 = (decode[i]>>7) & 0xF;
4917 i2 = 16*mlevel1[o1] + o2;
4918 if (mlevel2[i2] == 0xFF)
4919 mlevel2[i2] = count3++;
4920 o3 = decode[i] & 0x7F;
4921 i3 = 128*mlevel2[i2] + o3;
4922 mlevel3[i3] = i;
4923 }
4924 return result;
4925}
4926
4927static int
4928encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4929{
4930 struct encoding_map *map = (struct encoding_map*)mapping;
4931 int l1 = c>>11;
4932 int l2 = (c>>7) & 0xF;
4933 int l3 = c & 0x7F;
4934 int i;
4935
4936#ifdef Py_UNICODE_WIDE
4937 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004939 }
4940#endif
4941 if (c == 0)
4942 return 0;
4943 /* level 1*/
4944 i = map->level1[l1];
4945 if (i == 0xFF) {
4946 return -1;
4947 }
4948 /* level 2*/
4949 i = map->level23[16*i+l2];
4950 if (i == 0xFF) {
4951 return -1;
4952 }
4953 /* level 3 */
4954 i = map->level23[16*map->count2 + 128*i + l3];
4955 if (i == 0) {
4956 return -1;
4957 }
4958 return i;
4959}
4960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961/* Lookup the character ch in the mapping. If the character
4962 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004963 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965{
Christian Heimes217cfd12007-12-02 14:31:20 +00004966 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 PyObject *x;
4968
4969 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 x = PyObject_GetItem(mapping, w);
4972 Py_DECREF(w);
4973 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4975 /* No mapping found means: mapping is undefined. */
4976 PyErr_Clear();
4977 x = Py_None;
4978 Py_INCREF(x);
4979 return x;
4980 } else
4981 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004983 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004985 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 long value = PyLong_AS_LONG(x);
4987 if (value < 0 || value > 255) {
4988 PyErr_SetString(PyExc_TypeError,
4989 "character mapping must be in range(256)");
4990 Py_DECREF(x);
4991 return NULL;
4992 }
4993 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004995 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* wrong return value */
4999 PyErr_Format(PyExc_TypeError,
5000 "character mapping must return integer, bytes or None, not %.400s",
5001 x->ob_type->tp_name);
5002 Py_DECREF(x);
5003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 }
5005}
5006
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005008charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005009{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005010 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5011 /* exponentially overallocate to minimize reallocations */
5012 if (requiredsize < 2*outsize)
5013 requiredsize = 2*outsize;
5014 if (_PyBytes_Resize(outobj, requiredsize))
5015 return -1;
5016 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005017}
5018
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005023 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 space is available. Return a new reference to the object that
5025 was put in the output buffer, or Py_None, if the mapping was undefined
5026 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005027 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005029charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005032 PyObject *rep;
5033 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005034 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035
Christian Heimes90aa7642007-12-19 02:45:37 +00005036 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005037 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005039 if (res == -1)
5040 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 if (outsize<requiredsize)
5042 if (charmapencode_resize(outobj, outpos, requiredsize))
5043 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005044 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 outstart[(*outpos)++] = (char)res;
5046 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005047 }
5048
5049 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005052 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 Py_DECREF(rep);
5054 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005055 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 if (PyLong_Check(rep)) {
5057 Py_ssize_t requiredsize = *outpos+1;
5058 if (outsize<requiredsize)
5059 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5060 Py_DECREF(rep);
5061 return enc_EXCEPTION;
5062 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005063 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 else {
5067 const char *repchars = PyBytes_AS_STRING(rep);
5068 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5069 Py_ssize_t requiredsize = *outpos+repsize;
5070 if (outsize<requiredsize)
5071 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5072 Py_DECREF(rep);
5073 return enc_EXCEPTION;
5074 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005075 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 memcpy(outstart + *outpos, repchars, repsize);
5077 *outpos += repsize;
5078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005080 Py_DECREF(rep);
5081 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082}
5083
5084/* handle an error in PyUnicode_EncodeCharmap
5085 Return 0 on success, -1 on error */
5086static
5087int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005088 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005090 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005091 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092{
5093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t repsize;
5095 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 Py_UNICODE *uni2;
5097 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005098 Py_ssize_t collstartpos = *inpos;
5099 Py_ssize_t collendpos = *inpos+1;
5100 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101 char *encoding = "charmap";
5102 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005103 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 /* find all unencodable characters */
5106 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005107 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005108 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 int res = encoding_map_lookup(p[collendpos], mapping);
5110 if (res != -1)
5111 break;
5112 ++collendpos;
5113 continue;
5114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005115
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 rep = charmapencode_lookup(p[collendpos], mapping);
5117 if (rep==NULL)
5118 return -1;
5119 else if (rep!=Py_None) {
5120 Py_DECREF(rep);
5121 break;
5122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005123 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 }
5126 /* cache callback name lookup
5127 * (if not done yet, i.e. it's the first error) */
5128 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if ((errors==NULL) || (!strcmp(errors, "strict")))
5130 *known_errorHandler = 1;
5131 else if (!strcmp(errors, "replace"))
5132 *known_errorHandler = 2;
5133 else if (!strcmp(errors, "ignore"))
5134 *known_errorHandler = 3;
5135 else if (!strcmp(errors, "xmlcharrefreplace"))
5136 *known_errorHandler = 4;
5137 else
5138 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 }
5140 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005141 case 1: /* strict */
5142 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5143 return -1;
5144 case 2: /* replace */
5145 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 x = charmapencode_output('?', mapping, res, respos);
5147 if (x==enc_EXCEPTION) {
5148 return -1;
5149 }
5150 else if (x==enc_FAILED) {
5151 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5152 return -1;
5153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005154 }
5155 /* fall through */
5156 case 3: /* ignore */
5157 *inpos = collendpos;
5158 break;
5159 case 4: /* xmlcharrefreplace */
5160 /* generate replacement (temporarily (mis)uses p) */
5161 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 char buffer[2+29+1+1];
5163 char *cp;
5164 sprintf(buffer, "&#%d;", (int)p[collpos]);
5165 for (cp = buffer; *cp; ++cp) {
5166 x = charmapencode_output(*cp, mapping, res, respos);
5167 if (x==enc_EXCEPTION)
5168 return -1;
5169 else if (x==enc_FAILED) {
5170 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5171 return -1;
5172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005173 }
5174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005175 *inpos = collendpos;
5176 break;
5177 default:
5178 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 encoding, reason, p, size, exceptionObject,
5180 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005181 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005183 if (PyBytes_Check(repunicode)) {
5184 /* Directly copy bytes result to output. */
5185 Py_ssize_t outsize = PyBytes_Size(*res);
5186 Py_ssize_t requiredsize;
5187 repsize = PyBytes_Size(repunicode);
5188 requiredsize = *respos + repsize;
5189 if (requiredsize > outsize)
5190 /* Make room for all additional bytes. */
5191 if (charmapencode_resize(res, respos, requiredsize)) {
5192 Py_DECREF(repunicode);
5193 return -1;
5194 }
5195 memcpy(PyBytes_AsString(*res) + *respos,
5196 PyBytes_AsString(repunicode), repsize);
5197 *respos += repsize;
5198 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005199 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005200 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005202 /* generate replacement */
5203 repsize = PyUnicode_GET_SIZE(repunicode);
5204 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 x = charmapencode_output(*uni2, mapping, res, respos);
5206 if (x==enc_EXCEPTION) {
5207 return -1;
5208 }
5209 else if (x==enc_FAILED) {
5210 Py_DECREF(repunicode);
5211 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5212 return -1;
5213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005214 }
5215 *inpos = newpos;
5216 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 }
5218 return 0;
5219}
5220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 PyObject *mapping,
5224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 /* output object */
5227 PyObject *res = NULL;
5228 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232 PyObject *errorHandler = NULL;
5233 PyObject *exc = NULL;
5234 /* the following variable is used for caching string comparisons
5235 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5236 * 3=ignore, 4=xmlcharrefreplace */
5237 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
5239 /* Default to Latin-1 */
5240 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 /* allocate enough for a simple encoding without
5244 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005245 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 if (res == NULL)
5247 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005248 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 /* try to encode it */
5253 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5254 if (x==enc_EXCEPTION) /* error */
5255 goto onError;
5256 if (x==enc_FAILED) { /* unencodable character */
5257 if (charmap_encoding_error(p, size, &inpos, mapping,
5258 &exc,
5259 &known_errorHandler, &errorHandler, errors,
5260 &res, &respos)) {
5261 goto onError;
5262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005263 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 else
5265 /* done with this character => adjust input position */
5266 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005270 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005271 if (_PyBytes_Resize(&res, respos) < 0)
5272 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274 Py_XDECREF(exc);
5275 Py_XDECREF(errorHandler);
5276 return res;
5277
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 Py_XDECREF(res);
5280 Py_XDECREF(exc);
5281 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 return NULL;
5283}
5284
5285PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
5288 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 PyErr_BadArgument();
5290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 }
5292 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 PyUnicode_GET_SIZE(unicode),
5294 mapping,
5295 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296}
5297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298/* create or adjust a UnicodeTranslateError */
5299static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 const Py_UNICODE *unicode, Py_ssize_t size,
5301 Py_ssize_t startpos, Py_ssize_t endpos,
5302 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
5308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5310 goto onError;
5311 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5312 goto onError;
5313 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5314 goto onError;
5315 return;
5316 onError:
5317 Py_DECREF(*exceptionObject);
5318 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
5320}
5321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322/* raises a UnicodeTranslateError */
5323static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 const Py_UNICODE *unicode, Py_ssize_t size,
5325 Py_ssize_t startpos, Py_ssize_t endpos,
5326 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327{
5328 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332}
5333
5334/* error handling callback helper:
5335 build arguments, call the callback and check the arguments,
5336 put the result into newpos and return the replacement string, which
5337 has to be freed by the caller */
5338static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 PyObject **errorHandler,
5340 const char *reason,
5341 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5342 Py_ssize_t startpos, Py_ssize_t endpos,
5343 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005345 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005347 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 PyObject *restuple;
5349 PyObject *resunicode;
5350
5351 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 }
5356
5357 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361
5362 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005367 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 Py_DECREF(restuple);
5369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 }
5371 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 &resunicode, &i_newpos)) {
5373 Py_DECREF(restuple);
5374 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005378 else
5379 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005380 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5382 Py_DECREF(restuple);
5383 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 Py_INCREF(resunicode);
5386 Py_DECREF(restuple);
5387 return resunicode;
5388}
5389
5390/* Lookup the character ch in the mapping and put the result in result,
5391 which must be decrefed by the caller.
5392 Return 0 on success, -1 on error */
5393static
5394int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5395{
Christian Heimes217cfd12007-12-02 14:31:20 +00005396 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 PyObject *x;
5398
5399 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 x = PyObject_GetItem(mapping, w);
5402 Py_DECREF(w);
5403 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5405 /* No mapping found means: use 1:1 mapping. */
5406 PyErr_Clear();
5407 *result = NULL;
5408 return 0;
5409 } else
5410 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 }
5412 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *result = x;
5414 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005416 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 long value = PyLong_AS_LONG(x);
5418 long max = PyUnicode_GetMax();
5419 if (value < 0 || value > max) {
5420 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005421 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 Py_DECREF(x);
5423 return -1;
5424 }
5425 *result = x;
5426 return 0;
5427 }
5428 else if (PyUnicode_Check(x)) {
5429 *result = x;
5430 return 0;
5431 }
5432 else {
5433 /* wrong return value */
5434 PyErr_SetString(PyExc_TypeError,
5435 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005436 Py_DECREF(x);
5437 return -1;
5438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439}
5440/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 if not reallocate and adjust various state variables.
5442 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443static
Walter Dörwald4894c302003-10-24 14:25:28 +00005444int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005447 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005448 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 /* remember old output position */
5450 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5451 /* exponentially overallocate to minimize reallocations */
5452 if (requiredsize < 2 * oldsize)
5453 requiredsize = 2 * oldsize;
5454 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5455 return -1;
5456 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 }
5458 return 0;
5459}
5460/* lookup the character, put the result in the output string and adjust
5461 various state variables. Return a new reference to the object that
5462 was put in the output buffer in *result, or Py_None, if the mapping was
5463 undefined (in which case no character was written).
5464 The called must decref result.
5465 Return 0 on success, -1 on error. */
5466static
Walter Dörwald4894c302003-10-24 14:25:28 +00005467int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5469 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470{
Walter Dörwald4894c302003-10-24 14:25:28 +00005471 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 /* not found => default to 1:1 mapping */
5475 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 }
5477 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005479 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 /* no overflow check, because we know that the space is enough */
5481 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 }
5483 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5485 if (repsize==1) {
5486 /* no overflow check, because we know that the space is enough */
5487 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5488 }
5489 else if (repsize!=0) {
5490 /* more than one character */
5491 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5492 (insize - (curinp-startinp)) +
5493 repsize - 1;
5494 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5495 return -1;
5496 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5497 *outp += repsize;
5498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 }
5500 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 return 0;
5503}
5504
5505PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 Py_ssize_t size,
5507 PyObject *mapping,
5508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 /* output object */
5511 PyObject *res = NULL;
5512 /* pointers to the beginning and end+1 of input */
5513 const Py_UNICODE *startp = p;
5514 const Py_UNICODE *endp = p + size;
5515 /* pointer into the output */
5516 Py_UNICODE *str;
5517 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 char *reason = "character maps to <undefined>";
5520 PyObject *errorHandler = NULL;
5521 PyObject *exc = NULL;
5522 /* the following variable is used for caching string comparisons
5523 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5524 * 3=ignore, 4=xmlcharrefreplace */
5525 int known_errorHandler = -1;
5526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 PyErr_BadArgument();
5529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531
5532 /* allocate enough for a simple 1:1 translation without
5533 replacements, if we need more, we'll resize */
5534 res = PyUnicode_FromUnicode(NULL, size);
5535 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 /* try to encode it */
5543 PyObject *x = NULL;
5544 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5545 Py_XDECREF(x);
5546 goto onError;
5547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005548 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 if (x!=Py_None) /* it worked => adjust input pointer */
5550 ++p;
5551 else { /* untranslatable character */
5552 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5553 Py_ssize_t repsize;
5554 Py_ssize_t newpos;
5555 Py_UNICODE *uni2;
5556 /* startpos for collecting untranslatable chars */
5557 const Py_UNICODE *collstart = p;
5558 const Py_UNICODE *collend = p+1;
5559 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 /* find all untranslatable characters */
5562 while (collend < endp) {
5563 if (charmaptranslate_lookup(*collend, mapping, &x))
5564 goto onError;
5565 Py_XDECREF(x);
5566 if (x!=Py_None)
5567 break;
5568 ++collend;
5569 }
5570 /* cache callback name lookup
5571 * (if not done yet, i.e. it's the first error) */
5572 if (known_errorHandler==-1) {
5573 if ((errors==NULL) || (!strcmp(errors, "strict")))
5574 known_errorHandler = 1;
5575 else if (!strcmp(errors, "replace"))
5576 known_errorHandler = 2;
5577 else if (!strcmp(errors, "ignore"))
5578 known_errorHandler = 3;
5579 else if (!strcmp(errors, "xmlcharrefreplace"))
5580 known_errorHandler = 4;
5581 else
5582 known_errorHandler = 0;
5583 }
5584 switch (known_errorHandler) {
5585 case 1: /* strict */
5586 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005587 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 case 2: /* replace */
5589 /* No need to check for space, this is a 1:1 replacement */
5590 for (coll = collstart; coll<collend; ++coll)
5591 *str++ = '?';
5592 /* fall through */
5593 case 3: /* ignore */
5594 p = collend;
5595 break;
5596 case 4: /* xmlcharrefreplace */
5597 /* generate replacement (temporarily (mis)uses p) */
5598 for (p = collstart; p < collend; ++p) {
5599 char buffer[2+29+1+1];
5600 char *cp;
5601 sprintf(buffer, "&#%d;", (int)*p);
5602 if (charmaptranslate_makespace(&res, &str,
5603 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5604 goto onError;
5605 for (cp = buffer; *cp; ++cp)
5606 *str++ = *cp;
5607 }
5608 p = collend;
5609 break;
5610 default:
5611 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5612 reason, startp, size, &exc,
5613 collstart-startp, collend-startp, &newpos);
5614 if (repunicode == NULL)
5615 goto onError;
5616 /* generate replacement */
5617 repsize = PyUnicode_GET_SIZE(repunicode);
5618 if (charmaptranslate_makespace(&res, &str,
5619 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5620 Py_DECREF(repunicode);
5621 goto onError;
5622 }
5623 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5624 *str++ = *uni2;
5625 p = startp + newpos;
5626 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005628 }
5629 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 /* Resize if we allocated to much */
5631 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005632 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 if (PyUnicode_Resize(&res, respos) < 0)
5634 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 }
5636 Py_XDECREF(exc);
5637 Py_XDECREF(errorHandler);
5638 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 Py_XDECREF(res);
5642 Py_XDECREF(exc);
5643 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return NULL;
5645}
5646
5647PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 PyObject *mapping,
5649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650{
5651 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 str = PyUnicode_FromObject(str);
5654 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 PyUnicode_GET_SIZE(str),
5658 mapping,
5659 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 Py_DECREF(str);
5661 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005662
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 Py_XDECREF(str);
5665 return NULL;
5666}
Tim Petersced69f82003-09-16 20:30:58 +00005667
Guido van Rossum9e896b32000-04-05 20:11:21 +00005668/* --- Decimal Encoder ---------------------------------------------------- */
5669
5670int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 Py_ssize_t length,
5672 char *output,
5673 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005674{
5675 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 PyObject *errorHandler = NULL;
5677 PyObject *exc = NULL;
5678 const char *encoding = "decimal";
5679 const char *reason = "invalid decimal Unicode string";
5680 /* the following variable is used for caching string comparisons
5681 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5682 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005683
5684 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 PyErr_BadArgument();
5686 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005687 }
5688
5689 p = s;
5690 end = s + length;
5691 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 register Py_UNICODE ch = *p;
5693 int decimal;
5694 PyObject *repunicode;
5695 Py_ssize_t repsize;
5696 Py_ssize_t newpos;
5697 Py_UNICODE *uni2;
5698 Py_UNICODE *collstart;
5699 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005702 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 ++p;
5704 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 decimal = Py_UNICODE_TODECIMAL(ch);
5707 if (decimal >= 0) {
5708 *output++ = '0' + decimal;
5709 ++p;
5710 continue;
5711 }
5712 if (0 < ch && ch < 256) {
5713 *output++ = (char)ch;
5714 ++p;
5715 continue;
5716 }
5717 /* All other characters are considered unencodable */
5718 collstart = p;
5719 collend = p+1;
5720 while (collend < end) {
5721 if ((0 < *collend && *collend < 256) ||
5722 !Py_UNICODE_ISSPACE(*collend) ||
5723 Py_UNICODE_TODECIMAL(*collend))
5724 break;
5725 }
5726 /* cache callback name lookup
5727 * (if not done yet, i.e. it's the first error) */
5728 if (known_errorHandler==-1) {
5729 if ((errors==NULL) || (!strcmp(errors, "strict")))
5730 known_errorHandler = 1;
5731 else if (!strcmp(errors, "replace"))
5732 known_errorHandler = 2;
5733 else if (!strcmp(errors, "ignore"))
5734 known_errorHandler = 3;
5735 else if (!strcmp(errors, "xmlcharrefreplace"))
5736 known_errorHandler = 4;
5737 else
5738 known_errorHandler = 0;
5739 }
5740 switch (known_errorHandler) {
5741 case 1: /* strict */
5742 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5743 goto onError;
5744 case 2: /* replace */
5745 for (p = collstart; p < collend; ++p)
5746 *output++ = '?';
5747 /* fall through */
5748 case 3: /* ignore */
5749 p = collend;
5750 break;
5751 case 4: /* xmlcharrefreplace */
5752 /* generate replacement (temporarily (mis)uses p) */
5753 for (p = collstart; p < collend; ++p)
5754 output += sprintf(output, "&#%d;", (int)*p);
5755 p = collend;
5756 break;
5757 default:
5758 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5759 encoding, reason, s, length, &exc,
5760 collstart-s, collend-s, &newpos);
5761 if (repunicode == NULL)
5762 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005763 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005764 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005765 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5766 Py_DECREF(repunicode);
5767 goto onError;
5768 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 /* generate replacement */
5770 repsize = PyUnicode_GET_SIZE(repunicode);
5771 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5772 Py_UNICODE ch = *uni2;
5773 if (Py_UNICODE_ISSPACE(ch))
5774 *output++ = ' ';
5775 else {
5776 decimal = Py_UNICODE_TODECIMAL(ch);
5777 if (decimal >= 0)
5778 *output++ = '0' + decimal;
5779 else if (0 < ch && ch < 256)
5780 *output++ = (char)ch;
5781 else {
5782 Py_DECREF(repunicode);
5783 raise_encode_exception(&exc, encoding,
5784 s, length, collstart-s, collend-s, reason);
5785 goto onError;
5786 }
5787 }
5788 }
5789 p = s + newpos;
5790 Py_DECREF(repunicode);
5791 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005792 }
5793 /* 0-terminate the output string */
5794 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_XDECREF(exc);
5796 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005797 return 0;
5798
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 Py_XDECREF(exc);
5801 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005802 return -1;
5803}
5804
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805/* --- Helpers ------------------------------------------------------------ */
5806
Eric Smith8c663262007-08-25 02:26:07 +00005807#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005808#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005809#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005810/* Include _ParseTupleFinds from find.h */
5811#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005812#include "stringlib/find.h"
5813#include "stringlib/partition.h"
5814
Eric Smith5807c412008-05-11 21:00:57 +00005815#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005816#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005817#include "stringlib/localeutil.h"
5818
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819/* helper macro to fixup start/end slice values */
5820#define FIX_START_END(obj) \
5821 if (start < 0) \
5822 start += (obj)->length; \
5823 if (start < 0) \
5824 start = 0; \
5825 if (end > (obj)->length) \
5826 end = (obj)->length; \
5827 if (end < 0) \
5828 end += (obj)->length; \
5829 if (end < 0) \
5830 end = 0;
5831
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 PyObject *substr,
5834 Py_ssize_t start,
5835 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005838 PyUnicodeObject* str_obj;
5839 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005840
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5842 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5845 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 Py_DECREF(str_obj);
5847 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
Tim Petersced69f82003-09-16 20:30:58 +00005849
Thomas Wouters477c8d52006-05-27 19:21:47 +00005850 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005851
Thomas Wouters477c8d52006-05-27 19:21:47 +00005852 result = stringlib_count(
5853 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5854 );
5855
5856 Py_DECREF(sub_obj);
5857 Py_DECREF(str_obj);
5858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return result;
5860}
5861
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 PyObject *sub,
5864 Py_ssize_t start,
5865 Py_ssize_t end,
5866 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005869
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005873 sub = PyUnicode_FromObject(sub);
5874 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 Py_DECREF(str);
5876 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 }
Tim Petersced69f82003-09-16 20:30:58 +00005878
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879 if (direction > 0)
5880 result = stringlib_find_slice(
5881 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5882 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5883 start, end
5884 );
5885 else
5886 result = stringlib_rfind_slice(
5887 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5888 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5889 start, end
5890 );
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893 Py_DECREF(sub);
5894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return result;
5896}
5897
Tim Petersced69f82003-09-16 20:30:58 +00005898static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 PyUnicodeObject *substring,
5901 Py_ssize_t start,
5902 Py_ssize_t end,
5903 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 if (substring->length == 0)
5906 return 1;
5907
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910 end -= substring->length;
5911 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
5914 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 if (Py_UNICODE_MATCH(self, end, substring))
5916 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 } else {
5918 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 }
5921
5922 return 0;
5923}
5924
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 PyObject *substr,
5927 Py_ssize_t start,
5928 Py_ssize_t end,
5929 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005931 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 str = PyUnicode_FromObject(str);
5934 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 substr = PyUnicode_FromObject(substr);
5937 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 Py_DECREF(str);
5939 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Tim Petersced69f82003-09-16 20:30:58 +00005941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 (PyUnicodeObject *)substr,
5944 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 Py_DECREF(str);
5946 Py_DECREF(substr);
5947 return result;
5948}
5949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950/* Apply fixfct filter to the Unicode object self and return a
5951 reference to the modified object */
5952
Tim Petersced69f82003-09-16 20:30:58 +00005953static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
5957
5958 PyUnicodeObject *u;
5959
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005960 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005963
5964 Py_UNICODE_COPY(u->str, self->str, self->length);
5965
Tim Peters7a29bd52001-09-12 03:03:31 +00005966 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 /* fixfct should return TRUE if it modified the buffer. If
5968 FALSE, return a reference to the original buffer instead
5969 (to save space, not time) */
5970 Py_INCREF(self);
5971 Py_DECREF(u);
5972 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
5974 return (PyObject*) u;
5975}
5976
Tim Petersced69f82003-09-16 20:30:58 +00005977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978int fixupper(PyUnicodeObject *self)
5979{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005980 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 Py_UNICODE *s = self->str;
5982 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005986
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 ch = Py_UNICODE_TOUPPER(*s);
5988 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 *s = ch;
5991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 s++;
5993 }
5994
5995 return status;
5996}
5997
Tim Petersced69f82003-09-16 20:30:58 +00005998static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999int fixlower(PyUnicodeObject *self)
6000{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 Py_UNICODE *s = self->str;
6003 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 ch = Py_UNICODE_TOLOWER(*s);
6009 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 *s = ch;
6012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 s++;
6014 }
6015
6016 return status;
6017}
6018
Tim Petersced69f82003-09-16 20:30:58 +00006019static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020int fixswapcase(PyUnicodeObject *self)
6021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 Py_UNICODE *s = self->str;
6024 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 while (len-- > 0) {
6027 if (Py_UNICODE_ISUPPER(*s)) {
6028 *s = Py_UNICODE_TOLOWER(*s);
6029 status = 1;
6030 } else if (Py_UNICODE_ISLOWER(*s)) {
6031 *s = Py_UNICODE_TOUPPER(*s);
6032 status = 1;
6033 }
6034 s++;
6035 }
6036
6037 return status;
6038}
6039
Tim Petersced69f82003-09-16 20:30:58 +00006040static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041int fixcapitalize(PyUnicodeObject *self)
6042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006044 Py_UNICODE *s = self->str;
6045 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006046
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006047 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006049 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 *s = Py_UNICODE_TOUPPER(*s);
6051 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006053 s++;
6054 while (--len > 0) {
6055 if (Py_UNICODE_ISUPPER(*s)) {
6056 *s = Py_UNICODE_TOLOWER(*s);
6057 status = 1;
6058 }
6059 s++;
6060 }
6061 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062}
6063
6064static
6065int fixtitle(PyUnicodeObject *self)
6066{
6067 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6068 register Py_UNICODE *e;
6069 int previous_is_cased;
6070
6071 /* Shortcut for single character strings */
6072 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6074 if (*p != ch) {
6075 *p = ch;
6076 return 1;
6077 }
6078 else
6079 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Tim Petersced69f82003-09-16 20:30:58 +00006081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 e = p + PyUnicode_GET_SIZE(self);
6083 previous_is_cased = 0;
6084 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 if (previous_is_cased)
6088 *p = Py_UNICODE_TOLOWER(ch);
6089 else
6090 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006091
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 if (Py_UNICODE_ISLOWER(ch) ||
6093 Py_UNICODE_ISUPPER(ch) ||
6094 Py_UNICODE_ISTITLE(ch))
6095 previous_is_cased = 1;
6096 else
6097 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
6099 return 1;
6100}
6101
Tim Peters8ce9f162004-08-27 01:49:32 +00006102PyObject *
6103PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104{
Skip Montanaro6543b452004-09-16 03:28:13 +00006105 const Py_UNICODE blank = ' ';
6106 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006107 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006108 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006109 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6110 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006111 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6112 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006113 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006114 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Tim Peters05eba1f2004-08-27 21:32:02 +00006116 fseq = PySequence_Fast(seq, "");
6117 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006118 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006119 }
6120
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006121 /* NOTE: the following code can't call back into Python code,
6122 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006123 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006124
Tim Peters05eba1f2004-08-27 21:32:02 +00006125 seqlen = PySequence_Fast_GET_SIZE(fseq);
6126 /* If empty sequence, return u"". */
6127 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006128 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6129 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006130 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006131 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006132 /* If singleton sequence with an exact Unicode, return that. */
6133 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 item = items[0];
6135 if (PyUnicode_CheckExact(item)) {
6136 Py_INCREF(item);
6137 res = (PyUnicodeObject *)item;
6138 goto Done;
6139 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006140 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006141 else {
6142 /* Set up sep and seplen */
6143 if (separator == NULL) {
6144 sep = &blank;
6145 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006146 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006147 else {
6148 if (!PyUnicode_Check(separator)) {
6149 PyErr_Format(PyExc_TypeError,
6150 "separator: expected str instance,"
6151 " %.80s found",
6152 Py_TYPE(separator)->tp_name);
6153 goto onError;
6154 }
6155 sep = PyUnicode_AS_UNICODE(separator);
6156 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006157 }
6158 }
6159
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006160 /* There are at least two things to join, or else we have a subclass
6161 * of str in the sequence.
6162 * Do a pre-pass to figure out the total amount of space we'll
6163 * need (sz), and see whether all argument are strings.
6164 */
6165 sz = 0;
6166 for (i = 0; i < seqlen; i++) {
6167 const Py_ssize_t old_sz = sz;
6168 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 if (!PyUnicode_Check(item)) {
6170 PyErr_Format(PyExc_TypeError,
6171 "sequence item %zd: expected str instance,"
6172 " %.80s found",
6173 i, Py_TYPE(item)->tp_name);
6174 goto onError;
6175 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006176 sz += PyUnicode_GET_SIZE(item);
6177 if (i != 0)
6178 sz += seplen;
6179 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6180 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006182 goto onError;
6183 }
6184 }
Tim Petersced69f82003-09-16 20:30:58 +00006185
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006186 res = _PyUnicode_New(sz);
6187 if (res == NULL)
6188 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006189
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006190 /* Catenate everything. */
6191 res_p = PyUnicode_AS_UNICODE(res);
6192 for (i = 0; i < seqlen; ++i) {
6193 Py_ssize_t itemlen;
6194 item = items[i];
6195 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 /* Copy item, and maybe the separator. */
6197 if (i) {
6198 Py_UNICODE_COPY(res_p, sep, seplen);
6199 res_p += seplen;
6200 }
6201 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6202 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006203 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006206 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 return (PyObject *)res;
6208
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006210 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006211 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return NULL;
6213}
6214
Tim Petersced69f82003-09-16 20:30:58 +00006215static
6216PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 Py_ssize_t left,
6218 Py_ssize_t right,
6219 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220{
6221 PyUnicodeObject *u;
6222
6223 if (left < 0)
6224 left = 0;
6225 if (right < 0)
6226 right = 0;
6227
Tim Peters7a29bd52001-09-12 03:03:31 +00006228 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 Py_INCREF(self);
6230 return self;
6231 }
6232
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006233 if (left > PY_SSIZE_T_MAX - self->length ||
6234 right > PY_SSIZE_T_MAX - (left + self->length)) {
6235 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6236 return NULL;
6237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 u = _PyUnicode_New(left + self->length + right);
6239 if (u) {
6240 if (left)
6241 Py_UNICODE_FILL(u->str, fill, left);
6242 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6243 if (right)
6244 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6245 }
6246
6247 return u;
6248}
6249
/* Build a unicode object from data[left:right] and append it to `list`.

   The expansion uses the enclosing function's `str` and `list`
   variables and jumps to its `onError` label if the string cannot be
   created or the append fails.  The temporary reference held in `str`
   is released on both outcomes.

   Wrapped in do { } while (0) so that the macro expands to exactly one
   statement and is safe as the body of an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static
6262PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 PyObject *list,
6264 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006266 register Py_ssize_t i;
6267 register Py_ssize_t j;
6268 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006270 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006276 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6278 i++;
6279 if (j < i) {
6280 if (maxcount-- <= 0)
6281 break;
6282 SPLIT_APPEND(buf, j, i);
6283 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6284 i++;
6285 j = i;
6286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 }
6291 return list;
6292
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_DECREF(list);
6295 return NULL;
6296}
6297
6298PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006301 register Py_ssize_t i;
6302 register Py_ssize_t j;
6303 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 PyObject *list;
6305 PyObject *str;
6306 Py_UNICODE *data;
6307
6308 string = PyUnicode_FromObject(string);
6309 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 data = PyUnicode_AS_UNICODE(string);
6312 len = PyUnicode_GET_SIZE(string);
6313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 list = PyList_New(0);
6315 if (!list)
6316 goto onError;
6317
6318 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 /* Find a line and append it */
6322 while (i < len && !BLOOM_LINEBREAK(data[i]))
6323 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 if (i < len) {
6328 if (data[i] == '\r' && i + 1 < len &&
6329 data[i+1] == '\n')
6330 i += 2;
6331 else
6332 i++;
6333 if (keepends)
6334 eol = i;
6335 }
6336 SPLIT_APPEND(data, j, eol);
6337 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 }
6339 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 }
6342
6343 Py_DECREF(string);
6344 return list;
6345
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006347 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 Py_DECREF(string);
6349 return NULL;
6350}
6351
Tim Petersced69f82003-09-16 20:30:58 +00006352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 PyObject *list,
6355 Py_UNICODE ch,
6356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 register Py_ssize_t i;
6359 register Py_ssize_t j;
6360 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006362 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 if (buf[i] == ch) {
6366 if (maxcount-- <= 0)
6367 break;
6368 SPLIT_APPEND(buf, j, i);
6369 i = j = i + 1;
6370 } else
6371 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
6373 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
6376 return list;
6377
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 Py_DECREF(list);
6380 return NULL;
6381}
6382
Tim Petersced69f82003-09-16 20:30:58 +00006383static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 PyObject *list,
6386 PyUnicodeObject *substring,
6387 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 register Py_ssize_t i;
6390 register Py_ssize_t j;
6391 Py_ssize_t len = self->length;
6392 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 PyObject *str;
6394
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006395 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (Py_UNICODE_MATCH(self, i, substring)) {
6397 if (maxcount-- <= 0)
6398 break;
6399 SPLIT_APPEND(self->str, j, i);
6400 i = j = i + sublen;
6401 } else
6402 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 }
6404 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407 return list;
6408
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 Py_DECREF(list);
6411 return NULL;
6412}
6413
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006414static
6415PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 PyObject *list,
6417 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006418{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006419 register Py_ssize_t i;
6420 register Py_ssize_t j;
6421 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006423 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006424
6425 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006427 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006429 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6431 i--;
6432 if (j > i) {
6433 if (maxcount-- <= 0)
6434 break;
6435 SPLIT_APPEND(buf, i + 1, j + 1);
6436 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6437 i--;
6438 j = i;
6439 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006440 }
6441 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006444 if (PyList_Reverse(list) < 0)
6445 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446 return list;
6447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006449 Py_DECREF(list);
6450 return NULL;
6451}
6452
Benjamin Peterson14339b62009-01-31 16:36:08 +00006453static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006454PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 PyObject *list,
6456 Py_UNICODE ch,
6457 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006459 register Py_ssize_t i;
6460 register Py_ssize_t j;
6461 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006462 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006463 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006464
6465 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 if (buf[i] == ch) {
6467 if (maxcount-- <= 0)
6468 break;
6469 SPLIT_APPEND(buf, i + 1, j + 1);
6470 j = i = i - 1;
6471 } else
6472 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006473 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006474 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006476 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (PyList_Reverse(list) < 0)
6478 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479 return list;
6480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006482 Py_DECREF(list);
6483 return NULL;
6484}
6485
Benjamin Peterson14339b62009-01-31 16:36:08 +00006486static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006487PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 PyObject *list,
6489 PyUnicodeObject *substring,
6490 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006491{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 register Py_ssize_t i;
6493 register Py_ssize_t j;
6494 Py_ssize_t len = self->length;
6495 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006496 PyObject *str;
6497
6498 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 if (Py_UNICODE_MATCH(self, i, substring)) {
6500 if (maxcount-- <= 0)
6501 break;
6502 SPLIT_APPEND(self->str, i + sublen, j);
6503 j = i;
6504 i -= sublen;
6505 } else
6506 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006507 }
6508 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 if (PyList_Reverse(list) < 0)
6512 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006513 return list;
6514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006516 Py_DECREF(list);
6517 return NULL;
6518}
6519
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520#undef SPLIT_APPEND
6521
6522static
6523PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 PyUnicodeObject *substring,
6525 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
6527 PyObject *list;
6528
6529 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006530 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
6532 list = PyList_New(0);
6533 if (!list)
6534 return NULL;
6535
6536 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541
6542 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 Py_DECREF(list);
6544 PyErr_SetString(PyExc_ValueError, "empty separator");
6545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 }
6547 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
Tim Petersced69f82003-09-16 20:30:58 +00006551static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006552PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 PyUnicodeObject *substring,
6554 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006555{
6556 PyObject *list;
6557
6558 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006559 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006560
6561 list = PyList_New(0);
6562 if (!list)
6563 return NULL;
6564
6565 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006567
6568 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006570
6571 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 Py_DECREF(list);
6573 PyErr_SetString(PyExc_ValueError, "empty separator");
6574 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006575 }
6576 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006578}
6579
6580static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 PyUnicodeObject *str1,
6583 PyUnicodeObject *str2,
6584 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
6586 PyUnicodeObject *u;
6587
6588 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 if (str1->length == str2->length) {
6592 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006593 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006594 if (str1->length == 1) {
6595 /* replace characters */
6596 Py_UNICODE u1, u2;
6597 if (!findchar(self->str, self->length, str1->str[0]))
6598 goto nothing;
6599 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6600 if (!u)
6601 return NULL;
6602 Py_UNICODE_COPY(u->str, self->str, self->length);
6603 u1 = str1->str[0];
6604 u2 = str2->str[0];
6605 for (i = 0; i < u->length; i++)
6606 if (u->str[i] == u1) {
6607 if (--maxcount < 0)
6608 break;
6609 u->str[i] = u2;
6610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006612 i = fastsearch(
6613 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006615 if (i < 0)
6616 goto nothing;
6617 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6618 if (!u)
6619 return NULL;
6620 Py_UNICODE_COPY(u->str, self->str, self->length);
6621 while (i <= self->length - str1->length)
6622 if (Py_UNICODE_MATCH(self, i, str1)) {
6623 if (--maxcount < 0)
6624 break;
6625 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6626 i += str1->length;
6627 } else
6628 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631
6632 Py_ssize_t n, i, j, e;
6633 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 Py_UNICODE *p;
6635
6636 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006637 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 if (n > maxcount)
6639 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006640 if (n == 0)
6641 goto nothing;
6642 /* new_size = self->length + n * (str2->length - str1->length)); */
6643 delta = (str2->length - str1->length);
6644 if (delta == 0) {
6645 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006647 product = n * (str2->length - str1->length);
6648 if ((product / (str2->length - str1->length)) != n) {
6649 PyErr_SetString(PyExc_OverflowError,
6650 "replace string is too long");
6651 return NULL;
6652 }
6653 new_size = self->length + product;
6654 if (new_size < 0) {
6655 PyErr_SetString(PyExc_OverflowError,
6656 "replace string is too long");
6657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 }
6659 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006660 u = _PyUnicode_New(new_size);
6661 if (!u)
6662 return NULL;
6663 i = 0;
6664 p = u->str;
6665 e = self->length - str1->length;
6666 if (str1->length > 0) {
6667 while (n-- > 0) {
6668 /* look for next match */
6669 j = i;
6670 while (j <= e) {
6671 if (Py_UNICODE_MATCH(self, j, str1))
6672 break;
6673 j++;
6674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 if (j > e)
6677 break;
6678 /* copy unchanged part [i:j] */
6679 Py_UNICODE_COPY(p, self->str+i, j-i);
6680 p += j - i;
6681 }
6682 /* copy substitution string */
6683 if (str2->length > 0) {
6684 Py_UNICODE_COPY(p, str2->str, str2->length);
6685 p += str2->length;
6686 }
6687 i = j + str1->length;
6688 }
6689 if (i < self->length)
6690 /* copy tail [i:] */
6691 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6692 } else {
6693 /* interleave */
6694 while (n > 0) {
6695 Py_UNICODE_COPY(p, str2->str, str2->length);
6696 p += str2->length;
6697 if (--n <= 0)
6698 break;
6699 *p++ = self->str[i++];
6700 }
6701 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006707 /* nothing to replace; return original string (when possible) */
6708 if (PyUnicode_CheckExact(self)) {
6709 Py_INCREF(self);
6710 return (PyObject *) self;
6711 }
6712 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
6715/* --- Unicode Object Methods --------------------------------------------- */
6716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006717PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719\n\
6720Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722
6723static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006724unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 return fixup(self, fixtitle);
6727}
6728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006729PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731\n\
6732Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006736unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return fixup(self, fixcapitalize);
6739}
6740
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split into words, capitalize
   each word in place inside the list, then join the list back together.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, pos),
            fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6778
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006779/* Argument converter. Coerces to a single unicode character */
6780
6781static int
6782convert_uc(PyObject *obj, void *addr)
6783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6785 PyObject *uniobj;
6786 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006787
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788 uniobj = PyUnicode_FromObject(obj);
6789 if (uniobj == NULL) {
6790 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 return 0;
6793 }
6794 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6795 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006797 Py_DECREF(uniobj);
6798 return 0;
6799 }
6800 unistr = PyUnicode_AS_UNICODE(uniobj);
6801 *fillcharloc = unistr[0];
6802 Py_DECREF(uniobj);
6803 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006809Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006810done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject *
6813unicode_center(PyUnicodeObject *self, PyObject *args)
6814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t marg, left;
6816 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006817 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Thomas Woutersde017742006-02-16 19:34:37 +00006819 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 return NULL;
6821
Tim Peters7a29bd52001-09-12 03:03:31 +00006822 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 Py_INCREF(self);
6824 return (PyObject*) self;
6825 }
6826
6827 marg = width - self->length;
6828 left = marg / 2 + (marg & width & 1);
6829
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006830 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Marc-André Lemburge5034372000-08-08 08:04:29 +00006833#if 0
6834
6835/* This code should go into some future Unicode collation support
6836 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006837 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006838
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006839/* speedy UTF-16 code point order comparison */
6840/* gleaned from: */
6841/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006843static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006844{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006845 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006846 0, 0, 0, 0, 0, 0, 0, 0,
6847 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006848 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006849};
6850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851static int
6852unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 Py_UNICODE *s1 = str1->str;
6857 Py_UNICODE *s2 = str2->str;
6858
6859 len1 = str1->length;
6860 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006861
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006863 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006864
6865 c1 = *s1++;
6866 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006867
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 if (c1 > (1<<11) * 26)
6869 c1 += utf16Fixup[c1>>11];
6870 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006871 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006872 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006873
6874 if (c1 != c2)
6875 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006876
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006877 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 }
6879
6880 return (len1 < len2) ? -1 : (len1 != len2);
6881}
6882
Marc-André Lemburge5034372000-08-08 08:04:29 +00006883#else
6884
6885static int
6886unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006888 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006889
6890 Py_UNICODE *s1 = str1->str;
6891 Py_UNICODE *s2 = str2->str;
6892
6893 len1 = str1->length;
6894 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006895
Marc-André Lemburge5034372000-08-08 08:04:29 +00006896 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006897 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006898
Fredrik Lundh45714e92001-06-26 16:39:36 +00006899 c1 = *s1++;
6900 c2 = *s2++;
6901
6902 if (c1 != c2)
6903 return (c1 < c2) ? -1 : 1;
6904
Marc-André Lemburge5034372000-08-08 08:04:29 +00006905 len1--; len2--;
6906 }
6907
6908 return (len1 < len2) ? -1 : (len1 != len2);
6909}
6910
6911#endif
6912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006916 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6917 return unicode_compare((PyUnicodeObject *)left,
6918 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006919 PyErr_Format(PyExc_TypeError,
6920 "Can't compare %.100s and %.100s",
6921 left->ob_type->tp_name,
6922 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 return -1;
6924}
6925
Martin v. Löwis5b222132007-06-10 09:51:05 +00006926int
6927PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6928{
6929 int i;
6930 Py_UNICODE *id;
6931 assert(PyUnicode_Check(uni));
6932 id = PyUnicode_AS_UNICODE(uni);
6933 /* Compare Unicode string and source character set string */
6934 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 if (id[i] != str[i])
6936 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Petersonbb81c8c2010-01-09 21:54:39 +00006937 /* This check keeps Python strings that end in '\0' from comparing equal
6938 to C strings identical up to that point. */
6939 if (PyUnicode_GET_SIZE(uni) != i)
6940 /* We'll say the Python string is longer. */
6941 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006942 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006944 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006946 return 0;
6947}
6948
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006949
Benjamin Peterson29060642009-01-31 22:14:21 +00006950#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006951 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006952
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006953PyObject *PyUnicode_RichCompare(PyObject *left,
6954 PyObject *right,
6955 int op)
6956{
6957 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006958
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006959 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6960 PyObject *v;
6961 if (((PyUnicodeObject *) left)->length !=
6962 ((PyUnicodeObject *) right)->length) {
6963 if (op == Py_EQ) {
6964 Py_INCREF(Py_False);
6965 return Py_False;
6966 }
6967 if (op == Py_NE) {
6968 Py_INCREF(Py_True);
6969 return Py_True;
6970 }
6971 }
6972 if (left == right)
6973 result = 0;
6974 else
6975 result = unicode_compare((PyUnicodeObject *)left,
6976 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006977
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006978 /* Convert the return value to a Boolean */
6979 switch (op) {
6980 case Py_EQ:
6981 v = TEST_COND(result == 0);
6982 break;
6983 case Py_NE:
6984 v = TEST_COND(result != 0);
6985 break;
6986 case Py_LE:
6987 v = TEST_COND(result <= 0);
6988 break;
6989 case Py_GE:
6990 v = TEST_COND(result >= 0);
6991 break;
6992 case Py_LT:
6993 v = TEST_COND(result == -1);
6994 break;
6995 case Py_GT:
6996 v = TEST_COND(result == 1);
6997 break;
6998 default:
6999 PyErr_BadArgument();
7000 return NULL;
7001 }
7002 Py_INCREF(v);
7003 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007004 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007006 Py_INCREF(Py_NotImplemented);
7007 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007008}
7009
Guido van Rossum403d68b2000-03-13 15:55:09 +00007010int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007012{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007014 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007015
7016 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 sub = PyUnicode_FromObject(element);
7018 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 PyErr_Format(PyExc_TypeError,
7020 "'in <string>' requires string as left operand, not %s",
7021 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007023 }
7024
Thomas Wouters477c8d52006-05-27 19:21:47 +00007025 str = PyUnicode_FromObject(container);
7026 if (!str) {
7027 Py_DECREF(sub);
7028 return -1;
7029 }
7030
7031 result = stringlib_contains_obj(str, sub);
7032
7033 Py_DECREF(str);
7034 Py_DECREF(sub);
7035
Guido van Rossum403d68b2000-03-13 15:55:09 +00007036 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007037}
7038
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039/* Concat to string or Unicode object giving a new Unicode object. */
7040
7041PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 PyUnicodeObject *u = NULL, *v = NULL, *w;
7045
7046 /* Coerce the two arguments */
7047 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7048 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7051 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
7054 /* Shortcuts */
7055 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 Py_DECREF(v);
7057 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 }
7059 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 Py_DECREF(u);
7061 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 }
7063
7064 /* Concat the two Unicode strings */
7065 w = _PyUnicode_New(u->length + v->length);
7066 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 Py_UNICODE_COPY(w->str, u->str, u->length);
7069 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7070
7071 Py_DECREF(u);
7072 Py_DECREF(v);
7073 return (PyObject *)w;
7074
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 Py_XDECREF(u);
7077 Py_XDECREF(v);
7078 return NULL;
7079}
7080
Walter Dörwald1ab83302007-05-18 17:15:44 +00007081void
7082PyUnicode_Append(PyObject **pleft, PyObject *right)
7083{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 PyObject *new;
7085 if (*pleft == NULL)
7086 return;
7087 if (right == NULL || !PyUnicode_Check(*pleft)) {
7088 Py_DECREF(*pleft);
7089 *pleft = NULL;
7090 return;
7091 }
7092 new = PyUnicode_Concat(*pleft, right);
7093 Py_DECREF(*pleft);
7094 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007095}
7096
7097void
7098PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7099{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007100 PyUnicode_Append(pleft, right);
7101 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007107Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007108string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007109interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111static PyObject *
7112unicode_count(PyUnicodeObject *self, PyObject *args)
7113{
7114 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007115 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007116 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 PyObject *result;
7118
Guido van Rossumb8872e62000-05-09 14:14:27 +00007119 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 return NULL;
7122
7123 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007127
Thomas Wouters477c8d52006-05-27 19:21:47 +00007128 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
Christian Heimes217cfd12007-12-02 14:31:20 +00007130 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007131 stringlib_count(self->str + start, end - start,
7132 substring->str, substring->length)
7133 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134
7135 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 return result;
7138}
7139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007143Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007144to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007145handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7147'xmlcharrefreplace' as well as any other name registered with\n\
7148codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
7151unicode_encode(PyUnicodeObject *self, PyObject *args)
7152{
7153 char *encoding = NULL;
7154 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007155 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007156
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7158 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007159 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007160 if (v == NULL)
7161 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007162 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007163 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007164 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007165 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007166 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007167 Py_DECREF(v);
7168 return NULL;
7169 }
7170 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007171
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007173 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178\n\
7179Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182static PyObject*
7183unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7184{
7185 Py_UNICODE *e;
7186 Py_UNICODE *p;
7187 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007188 Py_UNICODE *qe;
7189 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 PyUnicodeObject *u;
7191 int tabsize = 8;
7192
7193 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195
Thomas Wouters7e474022000-07-16 12:04:32 +00007196 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007197 i = 0; /* chars up to and including most recent \n or \r */
7198 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7199 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 for (p = self->str; p < e; p++)
7201 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (tabsize > 0) {
7203 incr = tabsize - (j % tabsize); /* cannot overflow */
7204 if (j > PY_SSIZE_T_MAX - incr)
7205 goto overflow1;
7206 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007207 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 if (j > PY_SSIZE_T_MAX - 1)
7211 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 j++;
7213 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 if (i > PY_SSIZE_T_MAX - j)
7215 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007217 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 }
7219 }
7220
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007221 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007223
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 /* Second pass: create output string and fill it */
7225 u = _PyUnicode_New(i + j);
7226 if (!u)
7227 return NULL;
7228
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007229 j = 0; /* same as in first pass */
7230 q = u->str; /* next output char */
7231 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232
7233 for (p = self->str; p < e; p++)
7234 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (tabsize > 0) {
7236 i = tabsize - (j % tabsize);
7237 j += i;
7238 while (i--) {
7239 if (q >= qe)
7240 goto overflow2;
7241 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007242 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007244 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 else {
7246 if (q >= qe)
7247 goto overflow2;
7248 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007249 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 if (*p == '\n' || *p == '\r')
7251 j = 0;
7252 }
7253
7254 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007255
7256 overflow2:
7257 Py_DECREF(u);
7258 overflow1:
7259 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261}
7262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007263PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265\n\
7266Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007267such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268arguments start and end are interpreted as in slice notation.\n\
7269\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007270Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272static PyObject *
7273unicode_find(PyUnicodeObject *self, PyObject *args)
7274{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007275 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007276 Py_ssize_t start;
7277 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007278 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
Christian Heimes9cd17752007-11-18 19:35:23 +00007280 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
Thomas Wouters477c8d52006-05-27 19:21:47 +00007283 result = stringlib_find_slice(
7284 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7285 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7286 start, end
7287 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
7289 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007290
Christian Heimes217cfd12007-12-02 14:31:20 +00007291 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292}
7293
7294static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007295unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296{
7297 if (index < 0 || index >= self->length) {
7298 PyErr_SetString(PyExc_IndexError, "string index out of range");
7299 return NULL;
7300 }
7301
7302 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7303}
7304
Guido van Rossumc2504932007-09-18 19:42:40 +00007305/* Believe it or not, this produces the same value for ASCII strings
7306 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007308unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309{
Guido van Rossumc2504932007-09-18 19:42:40 +00007310 Py_ssize_t len;
7311 Py_UNICODE *p;
7312 long x;
7313
7314 if (self->hash != -1)
7315 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007316 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007317 p = self->str;
7318 x = *p << 7;
7319 while (--len >= 0)
7320 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007321 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007322 if (x == -1)
7323 x = -2;
7324 self->hash = x;
7325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326}
7327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007328PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007331Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332
7333static PyObject *
7334unicode_index(PyUnicodeObject *self, PyObject *args)
7335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007336 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007337 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007338 Py_ssize_t start;
7339 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
Christian Heimes9cd17752007-11-18 19:35:23 +00007341 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
Thomas Wouters477c8d52006-05-27 19:21:47 +00007344 result = stringlib_find_slice(
7345 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7346 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7347 start, end
7348 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
7350 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 if (result < 0) {
7353 PyErr_SetString(PyExc_ValueError, "substring not found");
7354 return NULL;
7355 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007356
Christian Heimes217cfd12007-12-02 14:31:20 +00007357 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358}
7359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007360PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007363Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007364at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365
7366static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007367unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368{
7369 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7370 register const Py_UNICODE *e;
7371 int cased;
7372
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 /* Shortcut for single character strings */
7374 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007377 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007378 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 e = p + PyUnicode_GET_SIZE(self);
7382 cased = 0;
7383 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007385
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7387 return PyBool_FromLong(0);
7388 else if (!cased && Py_UNICODE_ISLOWER(ch))
7389 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007391 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392}
7393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007394PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007397Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007398at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
7400static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007401unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402{
7403 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7404 register const Py_UNICODE *e;
7405 int cased;
7406
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 /* Shortcut for single character strings */
7408 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007411 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007412 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007414
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 e = p + PyUnicode_GET_SIZE(self);
7416 cased = 0;
7417 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007419
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7421 return PyBool_FromLong(0);
7422 else if (!cased && Py_UNICODE_ISUPPER(ch))
7423 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007425 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426}
7427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007431Return True if S is a titlecased string and there is at least one\n\
7432character in S, i.e. upper- and titlecase characters may only\n\
7433follow uncased characters and lowercase characters only cased ones.\n\
7434Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435
7436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007437unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438{
7439 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7440 register const Py_UNICODE *e;
7441 int cased, previous_is_cased;
7442
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 /* Shortcut for single character strings */
7444 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7446 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007448 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007449 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007451
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 e = p + PyUnicode_GET_SIZE(self);
7453 cased = 0;
7454 previous_is_cased = 0;
7455 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007457
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7459 if (previous_is_cased)
7460 return PyBool_FromLong(0);
7461 previous_is_cased = 1;
7462 cased = 1;
7463 }
7464 else if (Py_UNICODE_ISLOWER(ch)) {
7465 if (!previous_is_cased)
7466 return PyBool_FromLong(0);
7467 previous_is_cased = 1;
7468 cased = 1;
7469 }
7470 else
7471 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007473 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474}
7475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007476PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007479Return True if all characters in S are whitespace\n\
7480and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
7482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007483unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484{
7485 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7486 register const Py_UNICODE *e;
7487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 /* Shortcut for single character strings */
7489 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 Py_UNICODE_ISSPACE(*p))
7491 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007493 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007494 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007496
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 e = p + PyUnicode_GET_SIZE(self);
7498 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 if (!Py_UNICODE_ISSPACE(*p))
7500 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007502 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503}
7504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007505PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007507\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007508Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007509and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007510
7511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007512unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007513{
7514 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7515 register const Py_UNICODE *e;
7516
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007517 /* Shortcut for single character strings */
7518 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 Py_UNICODE_ISALPHA(*p))
7520 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007521
7522 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007523 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007525
7526 e = p + PyUnicode_GET_SIZE(self);
7527 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 if (!Py_UNICODE_ISALPHA(*p))
7529 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007531 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007537Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007539
7540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007541unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007542{
7543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7544 register const Py_UNICODE *e;
7545
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007546 /* Shortcut for single character strings */
7547 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 Py_UNICODE_ISALNUM(*p))
7549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007550
7551 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007552 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007554
7555 e = p + PyUnicode_GET_SIZE(self);
7556 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 if (!Py_UNICODE_ISALNUM(*p))
7558 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007560 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007561}
7562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007566Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
7569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007570unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571{
7572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7573 register const Py_UNICODE *e;
7574
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 /* Shortcut for single character strings */
7576 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 Py_UNICODE_ISDECIMAL(*p))
7578 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007581 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 e = p + PyUnicode_GET_SIZE(self);
7585 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 if (!Py_UNICODE_ISDECIMAL(*p))
7587 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007589 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590}
7591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007592PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007595Return True if all characters in S are digits\n\
7596and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
7598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007599unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600{
7601 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7602 register const Py_UNICODE *e;
7603
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 /* Shortcut for single character strings */
7605 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 Py_UNICODE_ISDIGIT(*p))
7607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007609 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007610 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007612
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 e = p + PyUnicode_GET_SIZE(self);
7614 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 if (!Py_UNICODE_ISDIGIT(*p))
7616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619}
7620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007624Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007628unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629{
7630 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7631 register const Py_UNICODE *e;
7632
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 /* Shortcut for single character strings */
7634 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 Py_UNICODE_ISNUMERIC(*p))
7636 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007639 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007641
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 e = p + PyUnicode_GET_SIZE(self);
7643 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 if (!Py_UNICODE_ISNUMERIC(*p))
7645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007647 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648}
7649
Martin v. Löwis47383402007-08-15 07:32:56 +00007650int
7651PyUnicode_IsIdentifier(PyObject *self)
7652{
7653 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7654 register const Py_UNICODE *e;
7655
7656 /* Special case for empty strings */
7657 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007659
7660 /* PEP 3131 says that the first character must be in
7661 XID_Start and subsequent characters in XID_Continue,
7662 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007664 letters, digits, underscore). However, given the current
7665 definition of XID_Start and XID_Continue, it is sufficient
7666 to check just for these, except that _ must be allowed
7667 as starting an identifier. */
7668 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7669 return 0;
7670
7671 e = p + PyUnicode_GET_SIZE(self);
7672 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (!_PyUnicode_IsXidContinue(*p))
7674 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007675 }
7676 return 1;
7677}
7678
7679PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007681\n\
7682Return True if S is a valid identifier according\n\
7683to the language definition.");
7684
7685static PyObject*
7686unicode_isidentifier(PyObject *self)
7687{
7688 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7689}
7690
Georg Brandl559e5d72008-06-11 18:37:52 +00007691PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007693\n\
7694Return True if all characters in S are considered\n\
7695printable in repr() or S is empty, False otherwise.");
7696
7697static PyObject*
7698unicode_isprintable(PyObject *self)
7699{
7700 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7701 register const Py_UNICODE *e;
7702
7703 /* Shortcut for single character strings */
7704 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7705 Py_RETURN_TRUE;
7706 }
7707
7708 e = p + PyUnicode_GET_SIZE(self);
7709 for (; p < e; p++) {
7710 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7711 Py_RETURN_FALSE;
7712 }
7713 }
7714 Py_RETURN_TRUE;
7715}
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(join__doc__,
Georg Brandl628e6f92009-10-27 20:24:45 +00007718 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719\n\
7720Return a string which is the concatenation of the strings in the\n\
Georg Brandl628e6f92009-10-27 20:24:45 +00007721iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722
7723static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007724unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007726 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727}
7728
Martin v. Löwis18e16552006-02-15 17:27:45 +00007729static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730unicode_length(PyUnicodeObject *self)
7731{
7732 return self->length;
7733}
7734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007738Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007739done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
7741static PyObject *
7742unicode_ljust(PyUnicodeObject *self, PyObject *args)
7743{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007744 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007745 Py_UNICODE fillchar = ' ';
7746
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007747 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 return NULL;
7749
Tim Peters7a29bd52001-09-12 03:03:31 +00007750 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 Py_INCREF(self);
7752 return (PyObject*) self;
7753 }
7754
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007755 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756}
7757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007761Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762
7763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007764unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 return fixup(self, fixlower);
7767}
7768
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for strip mode i: skip the 3-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7777
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007778/* externally visible for str.strip(unicode) */
7779PyObject *
7780_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7781{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7783 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7784 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7785 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7786 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007787
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007789
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 i = 0;
7791 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7793 i++;
7794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007795 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007796
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 j = len;
7798 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 do {
7800 j--;
7801 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7802 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007803 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007804
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 Py_INCREF(self);
7807 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007808 }
7809 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811}
7812
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
7814static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007815do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7818 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007819
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 i = 0;
7821 if (striptype != RIGHTSTRIP) {
7822 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7823 i++;
7824 }
7825 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007826
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 j = len;
7828 if (striptype != LEFTSTRIP) {
7829 do {
7830 j--;
7831 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7832 j++;
7833 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007834
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7836 Py_INCREF(self);
7837 return (PyObject*)self;
7838 }
7839 else
7840 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841}
7842
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843
7844static PyObject *
7845do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7846{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007848
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7850 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 if (sep != NULL && sep != Py_None) {
7853 if (PyUnicode_Check(sep))
7854 return _PyUnicode_XStrip(self, striptype, sep);
7855 else {
7856 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 "%s arg must be None or str",
7858 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 return NULL;
7860 }
7861 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007862
Benjamin Peterson14339b62009-01-31 16:36:08 +00007863 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007864}
7865
7866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007867PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007869\n\
7870Return a copy of the string S with leading and trailing\n\
7871whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007872If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007873
7874static PyObject *
7875unicode_strip(PyUnicodeObject *self, PyObject *args)
7876{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 if (PyTuple_GET_SIZE(args) == 0)
7878 return do_strip(self, BOTHSTRIP); /* Common case */
7879 else
7880 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007881}
7882
7883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007884PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007886\n\
7887Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007888If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007889
7890static PyObject *
7891unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7892{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007893 if (PyTuple_GET_SIZE(args) == 0)
7894 return do_strip(self, LEFTSTRIP); /* Common case */
7895 else
7896 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007897}
7898
7899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007900PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007902\n\
7903Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007904If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905
7906static PyObject *
7907unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7908{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007909 if (PyTuple_GET_SIZE(args) == 0)
7910 return do_strip(self, RIGHTSTRIP); /* Common case */
7911 else
7912 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007913}
7914
7915
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007917unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918{
7919 PyUnicodeObject *u;
7920 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007921 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007922 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923
Georg Brandl222de0f2009-04-12 12:01:50 +00007924 if (len < 1) {
7925 Py_INCREF(unicode_empty);
7926 return (PyObject *)unicode_empty;
7927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928
Tim Peters7a29bd52001-09-12 03:03:31 +00007929 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 /* no repeat, return original string */
7931 Py_INCREF(str);
7932 return (PyObject*) str;
7933 }
Tim Peters8f422462000-09-09 06:13:41 +00007934
7935 /* ensure # of chars needed doesn't overflow int and # of bytes
7936 * needed doesn't overflow size_t
7937 */
7938 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007939 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007940 PyErr_SetString(PyExc_OverflowError,
7941 "repeated string is too long");
7942 return NULL;
7943 }
7944 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7945 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7946 PyErr_SetString(PyExc_OverflowError,
7947 "repeated string is too long");
7948 return NULL;
7949 }
7950 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 if (!u)
7952 return NULL;
7953
7954 p = u->str;
7955
Georg Brandl222de0f2009-04-12 12:01:50 +00007956 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007957 Py_UNICODE_FILL(p, str->str[0], len);
7958 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007959 Py_ssize_t done = str->length; /* number of characters copied this far */
7960 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007962 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007963 Py_UNICODE_COPY(p+done, p, n);
7964 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 }
7967
7968 return (PyObject*) u;
7969}
7970
7971PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 PyObject *subobj,
7973 PyObject *replobj,
7974 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975{
7976 PyObject *self;
7977 PyObject *str1;
7978 PyObject *str2;
7979 PyObject *result;
7980
7981 self = PyUnicode_FromObject(obj);
7982 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 str1 = PyUnicode_FromObject(subobj);
7985 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 Py_DECREF(self);
7987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 }
7989 str2 = PyUnicode_FromObject(replobj);
7990 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_DECREF(self);
7992 Py_DECREF(str1);
7993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 }
Tim Petersced69f82003-09-16 20:30:58 +00007995 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 (PyUnicodeObject *)str1,
7997 (PyUnicodeObject *)str2,
7998 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 Py_DECREF(self);
8000 Py_DECREF(str1);
8001 Py_DECREF(str2);
8002 return result;
8003}
8004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008005PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007\n\
8008Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008009old replaced by new. If the optional argument count is\n\
8010given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011
8012static PyObject*
8013unicode_replace(PyUnicodeObject *self, PyObject *args)
8014{
8015 PyUnicodeObject *str1;
8016 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008017 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 PyObject *result;
8019
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 return NULL;
8022 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8023 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008026 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 Py_DECREF(str1);
8028 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
8031 result = replace(self, str1, str2, maxcount);
8032
8033 Py_DECREF(str1);
8034 Py_DECREF(str2);
8035 return result;
8036}
8037
8038static
8039PyObject *unicode_repr(PyObject *unicode)
8040{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008041 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008042 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008043 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8044 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8045
8046 /* XXX(nnorwitz): rather than over-allocating, it would be
8047 better to choose a different scheme. Perhaps scan the
8048 first N-chars of the string and allocate based on that size.
8049 */
8050 /* Initial allocation is based on the longest-possible unichr
8051 escape.
8052
8053 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8054 unichr, so in this case it's the longest unichr escape. In
8055 narrow (UTF-16) builds this is five chars per source unichr
8056 since there are two unichrs in the surrogate pair, so in narrow
8057 (UTF-16) builds it's not the longest unichr escape.
8058
8059 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8060 so in the narrow (UTF-16) build case it's the longest unichr
8061 escape.
8062 */
8063
Walter Dörwald1ab83302007-05-18 17:15:44 +00008064 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008066#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008068#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008070#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008072 if (repr == NULL)
8073 return NULL;
8074
Walter Dörwald1ab83302007-05-18 17:15:44 +00008075 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008076
8077 /* Add quote */
8078 *p++ = (findchar(s, size, '\'') &&
8079 !findchar(s, size, '"')) ? '"' : '\'';
8080 while (size-- > 0) {
8081 Py_UNICODE ch = *s++;
8082
8083 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008084 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008085 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008086 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008087 continue;
8088 }
8089
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008091 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008092 *p++ = '\\';
8093 *p++ = 't';
8094 }
8095 else if (ch == '\n') {
8096 *p++ = '\\';
8097 *p++ = 'n';
8098 }
8099 else if (ch == '\r') {
8100 *p++ = '\\';
8101 *p++ = 'r';
8102 }
8103
8104 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008105 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008106 *p++ = '\\';
8107 *p++ = 'x';
8108 *p++ = hexdigits[(ch >> 4) & 0x000F];
8109 *p++ = hexdigits[ch & 0x000F];
8110 }
8111
Georg Brandl559e5d72008-06-11 18:37:52 +00008112 /* Copy ASCII characters as-is */
8113 else if (ch < 0x7F) {
8114 *p++ = ch;
8115 }
8116
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008118 else {
8119 Py_UCS4 ucs = ch;
8120
8121#ifndef Py_UNICODE_WIDE
8122 Py_UNICODE ch2 = 0;
8123 /* Get code point from surrogate pair */
8124 if (size > 0) {
8125 ch2 = *s;
8126 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008131 size--;
8132 }
8133 }
8134#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008135 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008136 (categories Z* and C* except ASCII space)
8137 */
8138 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8139 /* Map 8-bit characters to '\xhh' */
8140 if (ucs <= 0xff) {
8141 *p++ = '\\';
8142 *p++ = 'x';
8143 *p++ = hexdigits[(ch >> 4) & 0x000F];
8144 *p++ = hexdigits[ch & 0x000F];
8145 }
8146 /* Map 21-bit characters to '\U00xxxxxx' */
8147 else if (ucs >= 0x10000) {
8148 *p++ = '\\';
8149 *p++ = 'U';
8150 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8151 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8152 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8153 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8154 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8155 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8156 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8157 *p++ = hexdigits[ucs & 0x0000000F];
8158 }
8159 /* Map 16-bit characters to '\uxxxx' */
8160 else {
8161 *p++ = '\\';
8162 *p++ = 'u';
8163 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8164 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8165 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8166 *p++ = hexdigits[ucs & 0x000F];
8167 }
8168 }
8169 /* Copy characters as-is */
8170 else {
8171 *p++ = ch;
8172#ifndef Py_UNICODE_WIDE
8173 if (ucs >= 0x10000)
8174 *p++ = ch2;
8175#endif
8176 }
8177 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008178 }
8179 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008180 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008181
8182 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008183 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185}
8186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008187PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189\n\
8190Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008191such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192arguments start and end are interpreted as in slice notation.\n\
8193\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008194Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
8196static PyObject *
8197unicode_rfind(PyUnicodeObject *self, PyObject *args)
8198{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008200 Py_ssize_t start;
8201 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008202 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203
Christian Heimes9cd17752007-11-18 19:35:23 +00008204 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206
Thomas Wouters477c8d52006-05-27 19:21:47 +00008207 result = stringlib_rfind_slice(
8208 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8209 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8210 start, end
8211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
8213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008214
Christian Heimes217cfd12007-12-02 14:31:20 +00008215 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216}
8217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008218PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008221Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222
8223static PyObject *
8224unicode_rindex(PyUnicodeObject *self, PyObject *args)
8225{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008226 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008227 Py_ssize_t start;
8228 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
Christian Heimes9cd17752007-11-18 19:35:23 +00008231 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
Thomas Wouters477c8d52006-05-27 19:21:47 +00008234 result = stringlib_rfind_slice(
8235 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8236 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8237 start, end
8238 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
8240 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 if (result < 0) {
8243 PyErr_SetString(PyExc_ValueError, "substring not found");
8244 return NULL;
8245 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008246 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247}
8248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008249PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008252Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008253done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254
8255static PyObject *
8256unicode_rjust(PyUnicodeObject *self, PyObject *args)
8257{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008258 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008259 Py_UNICODE fillchar = ' ';
8260
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008261 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 return NULL;
8263
Tim Peters7a29bd52001-09-12 03:03:31 +00008264 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 Py_INCREF(self);
8266 return (PyObject*) self;
8267 }
8268
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008269 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270}
8271
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 PyObject *sep,
8274 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275{
8276 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008277
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 s = PyUnicode_FromObject(s);
8279 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 if (sep != NULL) {
8282 sep = PyUnicode_FromObject(sep);
8283 if (sep == NULL) {
8284 Py_DECREF(s);
8285 return NULL;
8286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
8288
8289 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8290
8291 Py_DECREF(s);
8292 Py_XDECREF(sep);
8293 return result;
8294}
8295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008296PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298\n\
8299Return a list of the words in S, using sep as the\n\
8300delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008301splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008302whitespace string is a separator and empty strings are\n\
8303removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
8305static PyObject*
8306unicode_split(PyUnicodeObject *self, PyObject *args)
8307{
8308 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008309 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
Martin v. Löwis18e16552006-02-15 17:27:45 +00008311 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 return NULL;
8313
8314 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320}
8321
Thomas Wouters477c8d52006-05-27 19:21:47 +00008322PyObject *
8323PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8324{
8325 PyObject* str_obj;
8326 PyObject* sep_obj;
8327 PyObject* out;
8328
8329 str_obj = PyUnicode_FromObject(str_in);
8330 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008332 sep_obj = PyUnicode_FromObject(sep_in);
8333 if (!sep_obj) {
8334 Py_DECREF(str_obj);
8335 return NULL;
8336 }
8337
8338 out = stringlib_partition(
8339 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8340 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8341 );
8342
8343 Py_DECREF(sep_obj);
8344 Py_DECREF(str_obj);
8345
8346 return out;
8347}
8348
8349
8350PyObject *
8351PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8352{
8353 PyObject* str_obj;
8354 PyObject* sep_obj;
8355 PyObject* out;
8356
8357 str_obj = PyUnicode_FromObject(str_in);
8358 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008360 sep_obj = PyUnicode_FromObject(sep_in);
8361 if (!sep_obj) {
8362 Py_DECREF(str_obj);
8363 return NULL;
8364 }
8365
8366 out = stringlib_rpartition(
8367 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8368 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8369 );
8370
8371 Py_DECREF(sep_obj);
8372 Py_DECREF(str_obj);
8373
8374 return out;
8375}
8376
8377PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008380Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008381the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008382found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008383
8384static PyObject*
8385unicode_partition(PyUnicodeObject *self, PyObject *separator)
8386{
8387 return PyUnicode_Partition((PyObject *)self, separator);
8388}
8389
8390PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti4c81fbb2010-01-25 12:02:24 +00008391 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008393Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008395separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396
8397static PyObject*
8398unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8399{
8400 return PyUnicode_RPartition((PyObject *)self, separator);
8401}
8402
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008403PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 PyObject *sep,
8405 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008406{
8407 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008409 s = PyUnicode_FromObject(s);
8410 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 if (sep != NULL) {
8413 sep = PyUnicode_FromObject(sep);
8414 if (sep == NULL) {
8415 Py_DECREF(s);
8416 return NULL;
8417 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008418 }
8419
8420 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8421
8422 Py_DECREF(s);
8423 Py_XDECREF(sep);
8424 return result;
8425}
8426
8427PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008429\n\
8430Return a list of the words in S, using sep as the\n\
8431delimiter string, starting at the end of the string and\n\
8432working to the front. If maxsplit is given, at most maxsplit\n\
8433splits are done. If sep is not specified, any whitespace string\n\
8434is a separator.");
8435
8436static PyObject*
8437unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8438{
8439 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008440 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008441
Martin v. Löwis18e16552006-02-15 17:27:45 +00008442 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008443 return NULL;
8444
8445 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008447 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008449 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008451}
8452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008453PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455\n\
8456Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008457Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008458is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459
8460static PyObject*
8461unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8462{
Guido van Rossum86662912000-04-11 15:38:46 +00008463 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464
Guido van Rossum86662912000-04-11 15:38:46 +00008465 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 return NULL;
8467
Guido van Rossum86662912000-04-11 15:38:46 +00008468 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469}
8470
8471static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008472PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473{
Walter Dörwald346737f2007-05-31 10:44:43 +00008474 if (PyUnicode_CheckExact(self)) {
8475 Py_INCREF(self);
8476 return self;
8477 } else
8478 /* Subtype -- return genuine unicode string with the same value. */
8479 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8480 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481}
8482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485\n\
8486Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008487and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
8489static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008490unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 return fixup(self, fixswapcase);
8493}
8494
Georg Brandlceee0772007-11-27 23:48:05 +00008495PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008497\n\
8498Return a translation table usable for str.translate().\n\
8499If there is only one argument, it must be a dictionary mapping Unicode\n\
8500ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008501Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008502If there are two arguments, they must be strings of equal length, and\n\
8503in the resulting dictionary, each character in x will be mapped to the\n\
8504character at the same position in y. If there is a third argument, it\n\
8505must be a string, whose characters will be mapped to None in the result.");
8506
8507static PyObject*
8508unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8509{
8510 PyObject *x, *y = NULL, *z = NULL;
8511 PyObject *new = NULL, *key, *value;
8512 Py_ssize_t i = 0;
8513 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514
Georg Brandlceee0772007-11-27 23:48:05 +00008515 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8516 return NULL;
8517 new = PyDict_New();
8518 if (!new)
8519 return NULL;
8520 if (y != NULL) {
8521 /* x must be a string too, of equal length */
8522 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8523 if (!PyUnicode_Check(x)) {
8524 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8525 "be a string if there is a second argument");
8526 goto err;
8527 }
8528 if (PyUnicode_GET_SIZE(x) != ylen) {
8529 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8530 "arguments must have equal length");
8531 goto err;
8532 }
8533 /* create entries for translating chars in x to those in y */
8534 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008535 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8536 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008537 if (!key || !value)
8538 goto err;
8539 res = PyDict_SetItem(new, key, value);
8540 Py_DECREF(key);
8541 Py_DECREF(value);
8542 if (res < 0)
8543 goto err;
8544 }
8545 /* create entries for deleting chars in z */
8546 if (z != NULL) {
8547 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008548 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008549 if (!key)
8550 goto err;
8551 res = PyDict_SetItem(new, key, Py_None);
8552 Py_DECREF(key);
8553 if (res < 0)
8554 goto err;
8555 }
8556 }
8557 } else {
8558 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008559 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008560 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8561 "to maketrans it must be a dict");
8562 goto err;
8563 }
8564 /* copy entries into the new dict, converting string keys to int keys */
8565 while (PyDict_Next(x, &i, &key, &value)) {
8566 if (PyUnicode_Check(key)) {
8567 /* convert string keys to integer keys */
8568 PyObject *newkey;
8569 if (PyUnicode_GET_SIZE(key) != 1) {
8570 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8571 "table must be of length 1");
8572 goto err;
8573 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008574 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008575 if (!newkey)
8576 goto err;
8577 res = PyDict_SetItem(new, newkey, value);
8578 Py_DECREF(newkey);
8579 if (res < 0)
8580 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008581 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008582 /* just keep integer keys */
8583 if (PyDict_SetItem(new, key, value) < 0)
8584 goto err;
8585 } else {
8586 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8587 "be strings or integers");
8588 goto err;
8589 }
8590 }
8591 }
8592 return new;
8593 err:
8594 Py_DECREF(new);
8595 return NULL;
8596}
8597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008598PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600\n\
8601Return a copy of the string S, where all characters have been mapped\n\
8602through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008603Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008604Unmapped characters are left untouched. Characters mapped to None\n\
8605are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
8607static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008608unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609{
Georg Brandlceee0772007-11-27 23:48:05 +00008610 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611}
8612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008613PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008616Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
8618static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008619unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 return fixup(self, fixupper);
8622}
8623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008624PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008627Pad a numeric string S with zeros on the left, to fill a field\n\
8628of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
8630static PyObject *
8631unicode_zfill(PyUnicodeObject *self, PyObject *args)
8632{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008633 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 PyUnicodeObject *u;
8635
Martin v. Löwis18e16552006-02-15 17:27:45 +00008636 Py_ssize_t width;
8637 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 return NULL;
8639
8640 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008641 if (PyUnicode_CheckExact(self)) {
8642 Py_INCREF(self);
8643 return (PyObject*) self;
8644 }
8645 else
8646 return PyUnicode_FromUnicode(
8647 PyUnicode_AS_UNICODE(self),
8648 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 }
8651
8652 fill = width - self->length;
8653
8654 u = pad(self, fill, 0, '0');
8655
Walter Dörwald068325e2002-04-15 13:36:47 +00008656 if (u == NULL)
8657 return NULL;
8658
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 if (u->str[fill] == '+' || u->str[fill] == '-') {
8660 /* move sign to beginning of string */
8661 u->str[0] = u->str[fill];
8662 u->str[fill] = '0';
8663 }
8664
8665 return (PyObject*) u;
8666}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667
#if 0
/* Compiled out: returns the value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008676PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008679Return True if S starts with the specified prefix, False otherwise.\n\
8680With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008681With optional end, stop comparing S at that position.\n\
8682prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683
8684static PyObject *
8685unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008688 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008690 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008691 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008692 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008694 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8696 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008697 if (PyTuple_Check(subobj)) {
8698 Py_ssize_t i;
8699 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8700 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008702 if (substring == NULL)
8703 return NULL;
8704 result = tailmatch(self, substring, start, end, -1);
8705 Py_DECREF(substring);
8706 if (result) {
8707 Py_RETURN_TRUE;
8708 }
8709 }
8710 /* nothing matched */
8711 Py_RETURN_FALSE;
8712 }
8713 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008716 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008718 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719}
8720
8721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008722PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008725Return True if S ends with the specified suffix, False otherwise.\n\
8726With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727With optional end, stop comparing S at that position.\n\
8728suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
8730static PyObject *
8731unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008734 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008737 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008740 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8742 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008743 if (PyTuple_Check(subobj)) {
8744 Py_ssize_t i;
8745 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8746 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008748 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008750 result = tailmatch(self, substring, start, end, +1);
8751 Py_DECREF(substring);
8752 if (result) {
8753 Py_RETURN_TRUE;
8754 }
8755 }
8756 Py_RETURN_FALSE;
8757 }
8758 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008762 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008764 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765}
8766
Eric Smith8c663262007-08-25 02:26:07 +00008767#include "stringlib/string_format.h"
8768
8769PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008771\n\
8772");
8773
Eric Smith4a7d76d2008-05-30 18:10:19 +00008774static PyObject *
8775unicode__format__(PyObject* self, PyObject* args)
8776{
8777 PyObject *format_spec;
8778
8779 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8780 return NULL;
8781
8782 return _PyUnicode_FormatAdvanced(self,
8783 PyUnicode_AS_UNICODE(format_spec),
8784 PyUnicode_GET_SIZE(format_spec));
8785}
8786
Eric Smith8c663262007-08-25 02:26:07 +00008787PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008789\n\
8790");
8791
8792static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008793unicode__sizeof__(PyUnicodeObject *v)
8794{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008795 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8796 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008797}
8798
8799PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008801
8802static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008803unicode_getnewargs(PyUnicodeObject *v)
8804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008805 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008806}
8807
8808
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809static PyMethodDef unicode_methods[] = {
8810
8811 /* Order is according to common usage: often used methods should
8812 appear first, since lookup is done sequentially. */
8813
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008814 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8815 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8816 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008817 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008818 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8819 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8820 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8821 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8822 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8823 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8824 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008825 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008826 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8827 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8828 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008829 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008830 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8831 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8832 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008833 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008834 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008835 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008836 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008837 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8838 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8839 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8840 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8841 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8842 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8843 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8844 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8845 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8846 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8847 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8848 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8849 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8850 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008851 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008852 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008853 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008854 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008855 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008856 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8857 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008858 {"maketrans", (PyCFunction) unicode_maketrans,
8859 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008860 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008861#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008862 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863#endif
8864
8865#if 0
8866 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008867 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868#endif
8869
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 {NULL, NULL}
8872};
8873
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008874static PyObject *
8875unicode_mod(PyObject *v, PyObject *w)
8876{
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 if (!PyUnicode_Check(v)) {
8878 Py_INCREF(Py_NotImplemented);
8879 return Py_NotImplemented;
8880 }
8881 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008882}
8883
8884static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008885 0, /*nb_add*/
8886 0, /*nb_subtract*/
8887 0, /*nb_multiply*/
8888 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008889};
8890
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 (lenfunc) unicode_length, /* sq_length */
8893 PyUnicode_Concat, /* sq_concat */
8894 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8895 (ssizeargfunc) unicode_getitem, /* sq_item */
8896 0, /* sq_slice */
8897 0, /* sq_ass_item */
8898 0, /* sq_ass_slice */
8899 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900};
8901
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008902static PyObject*
8903unicode_subscript(PyUnicodeObject* self, PyObject* item)
8904{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008905 if (PyIndex_Check(item)) {
8906 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008907 if (i == -1 && PyErr_Occurred())
8908 return NULL;
8909 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008910 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008911 return unicode_getitem(self, i);
8912 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008913 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008914 Py_UNICODE* source_buf;
8915 Py_UNICODE* result_buf;
8916 PyObject* result;
8917
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008918 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008920 return NULL;
8921 }
8922
8923 if (slicelength <= 0) {
8924 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008925 } else if (start == 0 && step == 1 && slicelength == self->length &&
8926 PyUnicode_CheckExact(self)) {
8927 Py_INCREF(self);
8928 return (PyObject *)self;
8929 } else if (step == 1) {
8930 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008931 } else {
8932 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008933 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8934 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008935
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 if (result_buf == NULL)
8937 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008938
8939 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8940 result_buf[i] = source_buf[cur];
8941 }
Tim Petersced69f82003-09-16 20:30:58 +00008942
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008943 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008944 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008945 return result;
8946 }
8947 } else {
8948 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8949 return NULL;
8950 }
8951}
8952
8953static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008954 (lenfunc)unicode_length, /* mp_length */
8955 (binaryfunc)unicode_subscript, /* mp_subscript */
8956 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008957};
8958
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960/* Helpers for PyUnicode_Format() */
8961
8962static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008963getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008965 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 (*p_argidx)++;
8968 if (arglen < 0)
8969 return args;
8970 else
8971 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 }
8973 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 return NULL;
8976}
8977
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008978/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008980static PyObject *
8981formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008983 char *p;
8984 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008986
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 x = PyFloat_AsDouble(v);
8988 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008989 return NULL;
8990
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008993
Eric Smith0923d1d2009-04-16 20:16:10 +00008994 p = PyOS_double_to_string(x, type, prec,
8995 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008996 if (p == NULL)
8997 return NULL;
8998 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008999 PyMem_Free(p);
9000 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001}
9002
Tim Peters38fd5b62000-09-21 05:43:11 +00009003static PyObject*
9004formatlong(PyObject *val, int flags, int prec, int type)
9005{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 char *buf;
9007 int len;
9008 PyObject *str; /* temporary string object. */
9009 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009010
Benjamin Peterson14339b62009-01-31 16:36:08 +00009011 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9012 if (!str)
9013 return NULL;
9014 result = PyUnicode_FromStringAndSize(buf, len);
9015 Py_DECREF(str);
9016 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009017}
9018
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019static int
9020formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009021 size_t buflen,
9022 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009024 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009025 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 if (PyUnicode_GET_SIZE(v) == 1) {
9027 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9028 buf[1] = '\0';
9029 return 1;
9030 }
9031#ifndef Py_UNICODE_WIDE
9032 if (PyUnicode_GET_SIZE(v) == 2) {
9033 /* Decode a valid surrogate pair */
9034 int c0 = PyUnicode_AS_UNICODE(v)[0];
9035 int c1 = PyUnicode_AS_UNICODE(v)[1];
9036 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9037 0xDC00 <= c1 && c1 <= 0xDFFF) {
9038 buf[0] = c0;
9039 buf[1] = c1;
9040 buf[2] = '\0';
9041 return 2;
9042 }
9043 }
9044#endif
9045 goto onError;
9046 }
9047 else {
9048 /* Integer input truncated to a character */
9049 long x;
9050 x = PyLong_AsLong(v);
9051 if (x == -1 && PyErr_Occurred())
9052 goto onError;
9053
9054 if (x < 0 || x > 0x10ffff) {
9055 PyErr_SetString(PyExc_OverflowError,
9056 "%c arg not in range(0x110000)");
9057 return -1;
9058 }
9059
9060#ifndef Py_UNICODE_WIDE
9061 if (x > 0xffff) {
9062 x -= 0x10000;
9063 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9064 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9065 return 2;
9066 }
9067#endif
9068 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009069 buf[1] = '\0';
9070 return 1;
9071 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009072
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009074 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009076 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077}
9078
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009079/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009080 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009081*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009082#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009083
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086{
9087 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009088 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 int args_owned = 0;
9090 PyUnicodeObject *result = NULL;
9091 PyObject *dict = NULL;
9092 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009093
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 PyErr_BadInternalCall();
9096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 }
9098 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009099 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 fmt = PyUnicode_AS_UNICODE(uformat);
9102 fmtcnt = PyUnicode_GET_SIZE(uformat);
9103
9104 reslen = rescnt = fmtcnt + 100;
9105 result = _PyUnicode_New(reslen);
9106 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 res = PyUnicode_AS_UNICODE(result);
9109
9110 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 arglen = PyTuple_Size(args);
9112 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 }
9114 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 arglen = -1;
9116 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009118 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009119 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121
9122 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 if (*fmt != '%') {
9124 if (--rescnt < 0) {
9125 rescnt = fmtcnt + 100;
9126 reslen += rescnt;
9127 if (_PyUnicode_Resize(&result, reslen) < 0)
9128 goto onError;
9129 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9130 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 }
9134 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 /* Got a format specifier */
9136 int flags = 0;
9137 Py_ssize_t width = -1;
9138 int prec = -1;
9139 Py_UNICODE c = '\0';
9140 Py_UNICODE fill;
9141 int isnumok;
9142 PyObject *v = NULL;
9143 PyObject *temp = NULL;
9144 Py_UNICODE *pbuf;
9145 Py_UNICODE sign;
9146 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009147 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 fmt++;
9150 if (*fmt == '(') {
9151 Py_UNICODE *keystart;
9152 Py_ssize_t keylen;
9153 PyObject *key;
9154 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009155
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 if (dict == NULL) {
9157 PyErr_SetString(PyExc_TypeError,
9158 "format requires a mapping");
9159 goto onError;
9160 }
9161 ++fmt;
9162 --fmtcnt;
9163 keystart = fmt;
9164 /* Skip over balanced parentheses */
9165 while (pcount > 0 && --fmtcnt >= 0) {
9166 if (*fmt == ')')
9167 --pcount;
9168 else if (*fmt == '(')
9169 ++pcount;
9170 fmt++;
9171 }
9172 keylen = fmt - keystart - 1;
9173 if (fmtcnt < 0 || pcount > 0) {
9174 PyErr_SetString(PyExc_ValueError,
9175 "incomplete format key");
9176 goto onError;
9177 }
9178#if 0
9179 /* keys are converted to strings using UTF-8 and
9180 then looked up since Python uses strings to hold
9181 variables names etc. in its namespaces and we
9182 wouldn't want to break common idioms. */
9183 key = PyUnicode_EncodeUTF8(keystart,
9184 keylen,
9185 NULL);
9186#else
9187 key = PyUnicode_FromUnicode(keystart, keylen);
9188#endif
9189 if (key == NULL)
9190 goto onError;
9191 if (args_owned) {
9192 Py_DECREF(args);
9193 args_owned = 0;
9194 }
9195 args = PyObject_GetItem(dict, key);
9196 Py_DECREF(key);
9197 if (args == NULL) {
9198 goto onError;
9199 }
9200 args_owned = 1;
9201 arglen = -1;
9202 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 while (--fmtcnt >= 0) {
9205 switch (c = *fmt++) {
9206 case '-': flags |= F_LJUST; continue;
9207 case '+': flags |= F_SIGN; continue;
9208 case ' ': flags |= F_BLANK; continue;
9209 case '#': flags |= F_ALT; continue;
9210 case '0': flags |= F_ZERO; continue;
9211 }
9212 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 if (c == '*') {
9215 v = getnextarg(args, arglen, &argidx);
9216 if (v == NULL)
9217 goto onError;
9218 if (!PyLong_Check(v)) {
9219 PyErr_SetString(PyExc_TypeError,
9220 "* wants int");
9221 goto onError;
9222 }
9223 width = PyLong_AsLong(v);
9224 if (width == -1 && PyErr_Occurred())
9225 goto onError;
9226 if (width < 0) {
9227 flags |= F_LJUST;
9228 width = -width;
9229 }
9230 if (--fmtcnt >= 0)
9231 c = *fmt++;
9232 }
9233 else if (c >= '0' && c <= '9') {
9234 width = c - '0';
9235 while (--fmtcnt >= 0) {
9236 c = *fmt++;
9237 if (c < '0' || c > '9')
9238 break;
9239 if ((width*10) / 10 != width) {
9240 PyErr_SetString(PyExc_ValueError,
9241 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009242 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 }
9244 width = width*10 + (c - '0');
9245 }
9246 }
9247 if (c == '.') {
9248 prec = 0;
9249 if (--fmtcnt >= 0)
9250 c = *fmt++;
9251 if (c == '*') {
9252 v = getnextarg(args, arglen, &argidx);
9253 if (v == NULL)
9254 goto onError;
9255 if (!PyLong_Check(v)) {
9256 PyErr_SetString(PyExc_TypeError,
9257 "* wants int");
9258 goto onError;
9259 }
9260 prec = PyLong_AsLong(v);
9261 if (prec == -1 && PyErr_Occurred())
9262 goto onError;
9263 if (prec < 0)
9264 prec = 0;
9265 if (--fmtcnt >= 0)
9266 c = *fmt++;
9267 }
9268 else if (c >= '0' && c <= '9') {
9269 prec = c - '0';
9270 while (--fmtcnt >= 0) {
9271 c = Py_CHARMASK(*fmt++);
9272 if (c < '0' || c > '9')
9273 break;
9274 if ((prec*10) / 10 != prec) {
9275 PyErr_SetString(PyExc_ValueError,
9276 "prec too big");
9277 goto onError;
9278 }
9279 prec = prec*10 + (c - '0');
9280 }
9281 }
9282 } /* prec */
9283 if (fmtcnt >= 0) {
9284 if (c == 'h' || c == 'l' || c == 'L') {
9285 if (--fmtcnt >= 0)
9286 c = *fmt++;
9287 }
9288 }
9289 if (fmtcnt < 0) {
9290 PyErr_SetString(PyExc_ValueError,
9291 "incomplete format");
9292 goto onError;
9293 }
9294 if (c != '%') {
9295 v = getnextarg(args, arglen, &argidx);
9296 if (v == NULL)
9297 goto onError;
9298 }
9299 sign = 0;
9300 fill = ' ';
9301 switch (c) {
9302
9303 case '%':
9304 pbuf = formatbuf;
9305 /* presume that buffer length is at least 1 */
9306 pbuf[0] = '%';
9307 len = 1;
9308 break;
9309
9310 case 's':
9311 case 'r':
9312 case 'a':
Victor Stinnerabdb21a2010-03-22 12:53:14 +00009313 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009314 temp = v;
9315 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009316 }
9317 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 if (c == 's')
9319 temp = PyObject_Str(v);
9320 else if (c == 'r')
9321 temp = PyObject_Repr(v);
9322 else
9323 temp = PyObject_ASCII(v);
9324 if (temp == NULL)
9325 goto onError;
9326 if (PyUnicode_Check(temp))
9327 /* nothing to do */;
9328 else {
9329 Py_DECREF(temp);
9330 PyErr_SetString(PyExc_TypeError,
9331 "%s argument has non-string str()");
9332 goto onError;
9333 }
9334 }
9335 pbuf = PyUnicode_AS_UNICODE(temp);
9336 len = PyUnicode_GET_SIZE(temp);
9337 if (prec >= 0 && len > prec)
9338 len = prec;
9339 break;
9340
9341 case 'i':
9342 case 'd':
9343 case 'u':
9344 case 'o':
9345 case 'x':
9346 case 'X':
9347 if (c == 'i')
9348 c = 'd';
9349 isnumok = 0;
9350 if (PyNumber_Check(v)) {
9351 PyObject *iobj=NULL;
9352
9353 if (PyLong_Check(v)) {
9354 iobj = v;
9355 Py_INCREF(iobj);
9356 }
9357 else {
9358 iobj = PyNumber_Long(v);
9359 }
9360 if (iobj!=NULL) {
9361 if (PyLong_Check(iobj)) {
9362 isnumok = 1;
9363 temp = formatlong(iobj, flags, prec, c);
9364 Py_DECREF(iobj);
9365 if (!temp)
9366 goto onError;
9367 pbuf = PyUnicode_AS_UNICODE(temp);
9368 len = PyUnicode_GET_SIZE(temp);
9369 sign = 1;
9370 }
9371 else {
9372 Py_DECREF(iobj);
9373 }
9374 }
9375 }
9376 if (!isnumok) {
9377 PyErr_Format(PyExc_TypeError,
9378 "%%%c format: a number is required, "
9379 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9380 goto onError;
9381 }
9382 if (flags & F_ZERO)
9383 fill = '0';
9384 break;
9385
9386 case 'e':
9387 case 'E':
9388 case 'f':
9389 case 'F':
9390 case 'g':
9391 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009392 temp = formatfloat(v, flags, prec, c);
9393 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009395 pbuf = PyUnicode_AS_UNICODE(temp);
9396 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 sign = 1;
9398 if (flags & F_ZERO)
9399 fill = '0';
9400 break;
9401
9402 case 'c':
9403 pbuf = formatbuf;
9404 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9405 if (len < 0)
9406 goto onError;
9407 break;
9408
9409 default:
9410 PyErr_Format(PyExc_ValueError,
9411 "unsupported format character '%c' (0x%x) "
9412 "at index %zd",
9413 (31<=c && c<=126) ? (char)c : '?',
9414 (int)c,
9415 (Py_ssize_t)(fmt - 1 -
9416 PyUnicode_AS_UNICODE(uformat)));
9417 goto onError;
9418 }
9419 if (sign) {
9420 if (*pbuf == '-' || *pbuf == '+') {
9421 sign = *pbuf++;
9422 len--;
9423 }
9424 else if (flags & F_SIGN)
9425 sign = '+';
9426 else if (flags & F_BLANK)
9427 sign = ' ';
9428 else
9429 sign = 0;
9430 }
9431 if (width < len)
9432 width = len;
9433 if (rescnt - (sign != 0) < width) {
9434 reslen -= rescnt;
9435 rescnt = width + fmtcnt + 100;
9436 reslen += rescnt;
9437 if (reslen < 0) {
9438 Py_XDECREF(temp);
9439 PyErr_NoMemory();
9440 goto onError;
9441 }
9442 if (_PyUnicode_Resize(&result, reslen) < 0) {
9443 Py_XDECREF(temp);
9444 goto onError;
9445 }
9446 res = PyUnicode_AS_UNICODE(result)
9447 + reslen - rescnt;
9448 }
9449 if (sign) {
9450 if (fill != ' ')
9451 *res++ = sign;
9452 rescnt--;
9453 if (width > len)
9454 width--;
9455 }
9456 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9457 assert(pbuf[0] == '0');
9458 assert(pbuf[1] == c);
9459 if (fill != ' ') {
9460 *res++ = *pbuf++;
9461 *res++ = *pbuf++;
9462 }
9463 rescnt -= 2;
9464 width -= 2;
9465 if (width < 0)
9466 width = 0;
9467 len -= 2;
9468 }
9469 if (width > len && !(flags & F_LJUST)) {
9470 do {
9471 --rescnt;
9472 *res++ = fill;
9473 } while (--width > len);
9474 }
9475 if (fill == ' ') {
9476 if (sign)
9477 *res++ = sign;
9478 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9479 assert(pbuf[0] == '0');
9480 assert(pbuf[1] == c);
9481 *res++ = *pbuf++;
9482 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 }
9484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 Py_UNICODE_COPY(res, pbuf, len);
9486 res += len;
9487 rescnt -= len;
9488 while (--width >= len) {
9489 --rescnt;
9490 *res++ = ' ';
9491 }
9492 if (dict && (argidx < arglen) && c != '%') {
9493 PyErr_SetString(PyExc_TypeError,
9494 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009495 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 goto onError;
9497 }
9498 Py_XDECREF(temp);
9499 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 } /* until end */
9501 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 PyErr_SetString(PyExc_TypeError,
9503 "not all arguments converted during string formatting");
9504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505 }
9506
Thomas Woutersa96affe2006-03-12 00:29:36 +00009507 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 return (PyObject *)result;
9514
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 Py_XDECREF(result);
9517 Py_DECREF(uformat);
9518 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520 }
9521 return NULL;
9522}
9523
Jeremy Hylton938ace62002-07-17 16:30:39 +00009524static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009525unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9526
Tim Peters6d6c1a32001-08-02 04:15:00 +00009527static PyObject *
9528unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9529{
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009531 static char *kwlist[] = {"object", "encoding", "errors", 0};
9532 char *encoding = NULL;
9533 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009534
Benjamin Peterson14339b62009-01-31 16:36:08 +00009535 if (type != &PyUnicode_Type)
9536 return unicode_subtype_new(type, args, kwds);
9537 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009539 return NULL;
9540 if (x == NULL)
9541 return (PyObject *)_PyUnicode_New(0);
9542 if (encoding == NULL && errors == NULL)
9543 return PyObject_Str(x);
9544 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009545 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009546}
9547
Guido van Rossume023fe02001-08-30 03:12:59 +00009548static PyObject *
9549unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9550{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009551 PyUnicodeObject *tmp, *pnew;
9552 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009553
Benjamin Peterson14339b62009-01-31 16:36:08 +00009554 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9555 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9556 if (tmp == NULL)
9557 return NULL;
9558 assert(PyUnicode_Check(tmp));
9559 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9560 if (pnew == NULL) {
9561 Py_DECREF(tmp);
9562 return NULL;
9563 }
9564 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9565 if (pnew->str == NULL) {
9566 _Py_ForgetReference((PyObject *)pnew);
9567 PyObject_Del(pnew);
9568 Py_DECREF(tmp);
9569 return PyErr_NoMemory();
9570 }
9571 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9572 pnew->length = n;
9573 pnew->hash = tmp->hash;
9574 Py_DECREF(tmp);
9575 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009576}
9577
/* Docstring for the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009584
/* Forward declaration: used as the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Slots are listed in PyTypeObject field order;
   a 0 entry leaves the slot unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                      /* tp_name */
    sizeof(PyUnicodeObject),                    /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,                /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    unicode_repr,                               /* tp_repr */
    &unicode_as_number,                         /* tp_as_number */
    &unicode_as_sequence,                       /* tp_as_sequence */
    &unicode_as_mapping,                        /* tp_as_mapping */
    (hashfunc) unicode_hash,                    /* tp_hash*/
    0,                                          /* tp_call*/
    (reprfunc) unicode_str,                     /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,                /* tp_flags */
    unicode_doc,                                /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    PyUnicode_RichCompare,                      /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    unicode_iter,                               /* tp_iter */
    0,                                          /* tp_iternext */
    unicode_methods,                            /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    &PyBaseObject_Type,                         /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    unicode_new,                                /* tp_new */
    PyObject_Del,                               /* tp_free */
};
9630
9631/* Initialize the Unicode implementation */
9632
Thomas Wouters78890102000-07-22 19:25:51 +00009633void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009635 int i;
9636
Thomas Wouters477c8d52006-05-27 19:21:47 +00009637 /* XXX - move this array to unicodectype.c ? */
9638 Py_UNICODE linebreak[] = {
9639 0x000A, /* LINE FEED */
9640 0x000D, /* CARRIAGE RETURN */
9641 0x001C, /* FILE SEPARATOR */
9642 0x001D, /* GROUP SEPARATOR */
9643 0x001E, /* RECORD SEPARATOR */
9644 0x0085, /* NEXT LINE */
9645 0x2028, /* LINE SEPARATOR */
9646 0x2029, /* PARAGRAPH SEPARATOR */
9647 };
9648
Fred Drakee4315f52000-05-09 19:53:39 +00009649 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009650 free_list = NULL;
9651 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009653 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009655
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009656 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009658 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009660
9661 /* initialize the linebreak bloom filter */
9662 bloom_linebreak = make_bloom_mask(
9663 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9664 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009665
9666 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667}
9668
9669/* Finalize the Unicode implementation */
9670
Christian Heimesa156e092008-02-16 07:38:31 +00009671int
9672PyUnicode_ClearFreeList(void)
9673{
9674 int freelist_size = numfree;
9675 PyUnicodeObject *u;
9676
9677 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 PyUnicodeObject *v = u;
9679 u = *(PyUnicodeObject **)u;
9680 if (v->str)
9681 PyObject_DEL(v->str);
9682 Py_XDECREF(v->defenc);
9683 PyObject_Del(v);
9684 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009685 }
9686 free_list = NULL;
9687 assert(numfree == 0);
9688 return freelist_size;
9689}
9690
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691void
Thomas Wouters78890102000-07-22 19:25:51 +00009692_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009694 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009696 Py_XDECREF(unicode_empty);
9697 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009698
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009699 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (unicode_latin1[i]) {
9701 Py_DECREF(unicode_latin1[i]);
9702 unicode_latin1[i] = NULL;
9703 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009704 }
Christian Heimesa156e092008-02-16 07:38:31 +00009705 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009707
/* Intern the unicode string *p in place.

   If an equal string is already in the interned dictionary, *p is
   replaced by a new reference to that string and the reference to the
   original is dropped.  Otherwise the string is added to the dictionary
   (as both key and value) and marked SSTATE_INTERNED_MORTAL.

   Nothing is done for subclass instances or already interned strings.
   Failures to create or update the dictionary are swallowed: the error is
   cleared and *p is left unchanged.  Passing NULL or a non-unicode object
   is a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dictionary is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* Already interned: hand back the canonical object instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The flag is set around the insertion and cleared again on both the
       failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9755
9756void
9757PyUnicode_InternImmortal(PyObject **p)
9758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009759 PyUnicode_InternInPlace(p);
9760 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9761 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9762 Py_INCREF(*p);
9763 }
Walter Dörwald16807132007-05-25 13:52:07 +00009764}
9765
9766PyObject *
9767PyUnicode_InternFromString(const char *cp)
9768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009769 PyObject *s = PyUnicode_FromString(cp);
9770 if (s == NULL)
9771 return NULL;
9772 PyUnicode_InternInPlace(&s);
9773 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009774}
9775
9776void _Py_ReleaseInternedUnicodeStrings(void)
9777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 PyObject *keys;
9779 PyUnicodeObject *s;
9780 Py_ssize_t i, n;
9781 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009782
Benjamin Peterson14339b62009-01-31 16:36:08 +00009783 if (interned == NULL || !PyDict_Check(interned))
9784 return;
9785 keys = PyDict_Keys(interned);
9786 if (keys == NULL || !PyList_Check(keys)) {
9787 PyErr_Clear();
9788 return;
9789 }
Walter Dörwald16807132007-05-25 13:52:07 +00009790
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9792 detector, interned unicode strings are not forcibly deallocated;
9793 rather, we give them their stolen references back, and then clear
9794 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009795
Benjamin Peterson14339b62009-01-31 16:36:08 +00009796 n = PyList_GET_SIZE(keys);
9797 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009799 for (i = 0; i < n; i++) {
9800 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9801 switch (s->state) {
9802 case SSTATE_NOT_INTERNED:
9803 /* XXX Shouldn't happen */
9804 break;
9805 case SSTATE_INTERNED_IMMORTAL:
9806 Py_REFCNT(s) += 1;
9807 immortal_size += s->length;
9808 break;
9809 case SSTATE_INTERNED_MORTAL:
9810 Py_REFCNT(s) += 2;
9811 mortal_size += s->length;
9812 break;
9813 default:
9814 Py_FatalError("Inconsistent interned string state.");
9815 }
9816 s->state = SSTATE_NOT_INTERNED;
9817 }
9818 fprintf(stderr, "total size of all interned strings: "
9819 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9820 "mortal/immortal\n", mortal_size, immortal_size);
9821 Py_DECREF(keys);
9822 PyDict_Clear(interned);
9823 Py_DECREF(interned);
9824 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009825}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009826
9827
9828/********************* Unicode Iterator **************************/
9829
/* State of an iterator over the characters of a unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9835
/* Destructor for str iterators: untrack the iterator from the GC, drop
   the reference to the underlying string (which may already be NULL),
   then free the iterator's memory. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9843
/* GC traversal for str iterators: the only object reference held is the
   underlying string.  Returns 0 once it has been visited. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9850
9851static PyObject *
9852unicodeiter_next(unicodeiterobject *it)
9853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 PyUnicodeObject *seq;
9855 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009856
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 assert(it != NULL);
9858 seq = it->it_seq;
9859 if (seq == NULL)
9860 return NULL;
9861 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009862
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9864 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009866 if (item != NULL)
9867 ++it->it_index;
9868 return item;
9869 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009870
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 Py_DECREF(seq);
9872 it->it_seq = NULL;
9873 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009874}
9875
9876static PyObject *
9877unicodeiter_len(unicodeiterobject *it)
9878{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009879 Py_ssize_t len = 0;
9880 if (it->it_seq)
9881 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9882 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009883}
9884
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str iterators; installed as tp_methods of
   PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9892
/* Type object for str iterators.  Instances take part in garbage
   collection (Py_TPFLAGS_HAVE_GC) and are their own iterator. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                             /* tp_name */
    sizeof(unicodeiterobject),                  /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,            /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    0,                                          /* tp_repr */
    0,                                          /* tp_as_number */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    0,                                          /* tp_hash */
    0,                                          /* tp_call */
    0,                                          /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                          /* tp_doc */
    (traverseproc)unicodeiter_traverse,         /* tp_traverse */
    0,                                          /* tp_clear */
    0,                                          /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    PyObject_SelfIter,                          /* tp_iter */
    (iternextfunc)unicodeiter_next,             /* tp_iternext */
    unicodeiter_methods,                        /* tp_methods */
    0,                                          /* tp_members */
};
9925
9926static PyObject *
9927unicode_iter(PyObject *seq)
9928{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009930
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 if (!PyUnicode_Check(seq)) {
9932 PyErr_BadInternalCall();
9933 return NULL;
9934 }
9935 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9936 if (it == NULL)
9937 return NULL;
9938 it->it_index = 0;
9939 Py_INCREF(seq);
9940 it->it_seq = (PyUnicodeObject *)seq;
9941 _PyObject_GC_TRACK(it);
9942 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009943}
9944
Martin v. Löwis5b222132007-06-10 09:51:05 +00009945size_t
9946Py_UNICODE_strlen(const Py_UNICODE *u)
9947{
9948 int res = 0;
9949 while(*u++)
9950 res++;
9951 return res;
9952}
9953
9954Py_UNICODE*
9955Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9956{
9957 Py_UNICODE *u = s1;
9958 while ((*u++ = *s2++));
9959 return s1;
9960}
9961
9962Py_UNICODE*
9963Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9964{
9965 Py_UNICODE *u = s1;
9966 while ((*u++ = *s2++))
9967 if (n-- == 0)
9968 break;
9969 return s1;
9970}
9971
9972int
9973Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9974{
9975 while (*s1 && *s2 && *s1 == *s2)
9976 s1++, s2++;
9977 if (*s1 && *s2)
9978 return (*s1 < *s2) ? -1 : +1;
9979 if (*s1)
9980 return 1;
9981 if (*s2)
9982 return -1;
9983 return 0;
9984}
9985
9986Py_UNICODE*
9987Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9988{
9989 const Py_UNICODE *p;
9990 for (p = s; *p; p++)
9991 if (*p == c)
9992 return (Py_UNICODE*)p;
9993 return NULL;
9994}
9995
9996
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009997#ifdef __cplusplus
9998}
9999#endif
10000
10001
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010002/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 Local variables:
10004 c-basic-offset: 4
10005 indent-tabs-mode: nil
10006 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010007*/