blob: cdb739a3d5db1d36867254fe0da8e653312cfaf9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 for the
   whitespace characters named in the row comments, 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED, 0x0B VERTICAL TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Christian Heimes190d79e2008-01-30 11:58:22 +0000162/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 for the
   line break characters named in the row comments, 0 for everything else.
   Read through the BLOOM_LINEBREAK() macro for characters below 128. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
       0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type that holds a bloom bitmask (bits 0..31 are used). */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long on purpose: the bit index can be 31,
   and shifting a signed int 1 into the sign bit is undefined behaviour (and
   sign-extends into the upper half of a 64-bit mask in practice).  `mask` is
   parenthesized so that any expression may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: direct table lookup for characters below 128, otherwise
   the bloom mask acts as a cheap pre-filter in front of
   Py_UNICODE_ISLINEBREAK().  `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* True when `chr` is one of the `setlen` characters in `set`.  The bloom
   mask is checked first so that most non-members are rejected without
   scanning the set.  The expansion is fully parenthesized so the macro
   behaves as a single operand (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
/* We allocate one more character (Py_UNICODE element) to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Append the NUL-terminated C string `string` at the output cursor `s`,
   leaving `s` just past the last character written.  Expands to a brace
   block and relies on variables named `copy` and `s` being declared in
   the enclosing scope. */
#define appendstring(string)            \
    {                                   \
        copy = string;                  \
        while (*copy)                   \
            *s++ = *copy++;             \
    }
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000794 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis011e8422009-05-05 04:43:17 +00001533/* Convert the argument to a bytes object, according to the file
1534 system encoding */
1535
1536int
1537PyUnicode_FSConverter(PyObject* arg, void* addr)
1538{
1539 PyObject *output = NULL;
1540 Py_ssize_t size;
1541 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001542 if (arg == NULL) {
1543 Py_DECREF(*(PyObject**)addr);
1544 return 1;
1545 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001546 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1547 output = arg;
1548 Py_INCREF(output);
1549 }
1550 else {
1551 arg = PyUnicode_FromObject(arg);
1552 if (!arg)
1553 return 0;
1554 output = PyUnicode_AsEncodedObject(arg,
1555 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001556 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001557 Py_DECREF(arg);
1558 if (!output)
1559 return 0;
1560 if (!PyBytes_Check(output)) {
1561 Py_DECREF(output);
1562 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1563 return 0;
1564 }
1565 }
1566 if (PyBytes_Check(output)) {
1567 size = PyBytes_GET_SIZE(output);
1568 data = PyBytes_AS_STRING(output);
1569 }
1570 else {
1571 size = PyByteArray_GET_SIZE(output);
1572 data = PyByteArray_AS_STRING(output);
1573 }
1574 if (size != strlen(data)) {
1575 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1576 Py_DECREF(output);
1577 return 0;
1578 }
1579 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001580 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001581}
1582
1583
Martin v. Löwis5b222132007-06-10 09:51:05 +00001584char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001585_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001586{
Christian Heimesf3863112007-11-22 07:46:41 +00001587 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001588 if (!PyUnicode_Check(unicode)) {
1589 PyErr_BadArgument();
1590 return NULL;
1591 }
Christian Heimesf3863112007-11-22 07:46:41 +00001592 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1593 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001594 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001595 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001596 *psize = PyBytes_GET_SIZE(bytes);
1597 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598}
1599
1600char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001601_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001602{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001603 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001604}
1605
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1607{
1608 if (!PyUnicode_Check(unicode)) {
1609 PyErr_BadArgument();
1610 goto onError;
1611 }
1612 return PyUnicode_AS_UNICODE(unicode);
1613
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 return NULL;
1616}
1617
Martin v. Löwis18e16552006-02-15 17:27:45 +00001618Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619{
1620 if (!PyUnicode_Check(unicode)) {
1621 PyErr_BadArgument();
1622 goto onError;
1623 }
1624 return PyUnicode_GET_SIZE(unicode);
1625
Benjamin Peterson29060642009-01-31 22:14:21 +00001626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 return -1;
1628}
1629
Thomas Wouters78890102000-07-22 19:25:51 +00001630const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001631{
1632 return unicode_default_encoding;
1633}
1634
1635int PyUnicode_SetDefaultEncoding(const char *encoding)
1636{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001637 if (strcmp(encoding, unicode_default_encoding) != 0) {
1638 PyErr_Format(PyExc_ValueError,
1639 "Can only set default encoding to %s",
1640 unicode_default_encoding);
1641 return -1;
1642 }
Fred Drakee4315f52000-05-09 19:53:39 +00001643 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001644}
1645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646/* error handling callback helper:
1647 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001648 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 and adjust various state variables.
1650 return 0 on success, -1 on error
1651*/
1652
1653static
1654int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001655 const char *encoding, const char *reason,
1656 const char **input, const char **inend, Py_ssize_t *startinpos,
1657 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1658 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001660 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661
1662 PyObject *restuple = NULL;
1663 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001664 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001665 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t requiredsize;
1667 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001669 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001670 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 int res = -1;
1672
1673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 *errorHandler = PyCodec_LookupError(errors);
1675 if (*errorHandler == NULL)
1676 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 }
1678
1679 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1682 if (*exceptionObject == NULL)
1683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 }
1685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001686 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1687 goto onError;
1688 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1689 goto onError;
1690 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 }
1693
1694 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1695 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001696 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001698 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 }
1701 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001703
1704 /* Copy back the bytes variables, which might have been modified by the
1705 callback */
1706 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1707 if (!inputobj)
1708 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001709 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001710 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001711 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001712 *input = PyBytes_AS_STRING(inputobj);
1713 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001715 /* we can DECREF safely, as the exception has another reference,
1716 so the object won't go away. */
1717 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001720 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001721 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1723 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725
1726 /* need more space? (at least enough for what we
1727 have+the replacement+the rest of the string (starting
1728 at the new input position), so we won't have to check space
1729 when there are no errors in the rest of the string) */
1730 repptr = PyUnicode_AS_UNICODE(repunicode);
1731 repsize = PyUnicode_GET_SIZE(repunicode);
1732 requiredsize = *outpos + repsize + insize-newpos;
1733 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 if (requiredsize<2*outsize)
1735 requiredsize = 2*outsize;
1736 if (_PyUnicode_Resize(output, requiredsize) < 0)
1737 goto onError;
1738 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 }
1740 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001741 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_UNICODE_COPY(*outptr, repptr, repsize);
1743 *outptr += repsize;
1744 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 /* we made it! */
1747 res = 0;
1748
Benjamin Peterson29060642009-01-31 22:14:21 +00001749 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 Py_XDECREF(restuple);
1751 return res;
1752}
1753
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754/* --- UTF-7 Codec -------------------------------------------------------- */
1755
Antoine Pitrou244651a2009-05-04 18:56:13 +00001756/* See RFC2152 for details. We encode conservatively and decode liberally. */
1757
/* Three simple macros defining base-64.  Note that each of them may
   evaluate its argument more than once. */

/* Is c one of the 64 characters of the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/')? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   everything else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                      \
    (((c) == '+') ? 62 :                                    \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
1788
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0-127) and is never written to.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; anything outside 1..127 is never direct
 * directO  - non-zero if Set O characters may be written as themselves
 * directWS - non-zero if whitespace may be written as itself
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely.  c is evaluated
 * more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 Py_ssize_t size,
1837 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001839 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1840}
1841
Antoine Pitrou244651a2009-05-04 18:56:13 +00001842/* The decoder. The only state we preserve is our read position,
1843 * i.e. how many characters we have consumed. So if we end in the
1844 * middle of a shift sequence we have to back off the read position
1845 * and the output to the beginning of the sequence, otherwise we lose
1846 * all the shift state (seen bits, number of bits seen, high
1847 * surrogate). */
1848
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001849PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001850 Py_ssize_t size,
1851 const char *errors,
1852 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001855 Py_ssize_t startinpos;
1856 Py_ssize_t endinpos;
1857 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 const char *e;
1859 PyUnicodeObject *unicode;
1860 Py_UNICODE *p;
1861 const char *errmsg = "";
1862 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001863 Py_UNICODE *shiftOutStart;
1864 unsigned int base64bits = 0;
1865 unsigned long base64buffer = 0;
1866 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 PyObject *errorHandler = NULL;
1868 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001869
1870 unicode = _PyUnicode_New(size);
1871 if (!unicode)
1872 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001873 if (size == 0) {
1874 if (consumed)
1875 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001876 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001877 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001878
1879 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001880 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 e = s + size;
1882
1883 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001885 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001886 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001887
Antoine Pitrou244651a2009-05-04 18:56:13 +00001888 if (inShift) { /* in a base-64 section */
1889 if (IS_BASE64(ch)) { /* consume a base-64 character */
1890 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1891 base64bits += 6;
1892 s++;
1893 if (base64bits >= 16) {
1894 /* we have enough bits for a UTF-16 value */
1895 Py_UNICODE outCh = (Py_UNICODE)
1896 (base64buffer >> (base64bits-16));
1897 base64bits -= 16;
1898 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1899 if (surrogate) {
1900 /* expecting a second surrogate */
1901 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1902#ifdef Py_UNICODE_WIDE
1903 *p++ = (((surrogate & 0x3FF)<<10)
1904 | (outCh & 0x3FF)) + 0x10000;
1905#else
1906 *p++ = surrogate;
1907 *p++ = outCh;
1908#endif
1909 surrogate = 0;
1910 }
1911 else {
1912 surrogate = 0;
1913 errmsg = "second surrogate missing";
1914 goto utf7Error;
1915 }
1916 }
1917 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1918 /* first surrogate */
1919 surrogate = outCh;
1920 }
1921 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1922 errmsg = "unexpected second surrogate";
1923 goto utf7Error;
1924 }
1925 else {
1926 *p++ = outCh;
1927 }
1928 }
1929 }
1930 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001931 inShift = 0;
1932 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001933 if (surrogate) {
1934 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001937 if (base64bits > 0) { /* left-over bits */
1938 if (base64bits >= 6) {
1939 /* We've seen at least one base-64 character */
1940 errmsg = "partial character in shift sequence";
1941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001942 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001943 else {
1944 /* Some bits remain; they should be zero */
1945 if (base64buffer != 0) {
1946 errmsg = "non-zero padding bits in shift sequence";
1947 goto utf7Error;
1948 }
1949 }
1950 }
1951 if (ch != '-') {
1952 /* '-' is absorbed; other terminating
1953 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001954 *p++ = ch;
1955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956 }
1957 }
1958 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001960 s++; /* consume '+' */
1961 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 s++;
1963 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001964 }
1965 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001966 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 shiftOutStart = p;
1968 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969 }
1970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001971 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 *p++ = ch;
1973 s++;
1974 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 else {
1976 startinpos = s-starts;
1977 s++;
1978 errmsg = "unexpected special character";
1979 goto utf7Error;
1980 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001981 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001982utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001983 outpos = p-PyUnicode_AS_UNICODE(unicode);
1984 endinpos = s-starts;
1985 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 errors, &errorHandler,
1987 "utf7", errmsg,
1988 &starts, &e, &startinpos, &endinpos, &exc, &s,
1989 &unicode, &outpos, &p))
1990 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001991 }
1992
Antoine Pitrou244651a2009-05-04 18:56:13 +00001993 /* end of string */
1994
1995 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1996 /* if we're in an inconsistent state, that's an error */
1997 if (surrogate ||
1998 (base64bits >= 6) ||
1999 (base64bits > 0 && base64buffer != 0)) {
2000 outpos = p-PyUnicode_AS_UNICODE(unicode);
2001 endinpos = size;
2002 if (unicode_decode_call_errorhandler(
2003 errors, &errorHandler,
2004 "utf7", "unterminated shift sequence",
2005 &starts, &e, &startinpos, &endinpos, &exc, &s,
2006 &unicode, &outpos, &p))
2007 goto onError;
2008 if (s < e)
2009 goto restart;
2010 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012
2013 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002014 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 if (inShift) {
2016 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 }
2019 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002024 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 goto onError;
2026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 Py_XDECREF(errorHandler);
2028 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 return (PyObject *)unicode;
2030
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 Py_XDECREF(errorHandler);
2033 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034 Py_DECREF(unicode);
2035 return NULL;
2036}
2037
2038
2039PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 int base64SetO,
2042 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002045 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002046 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002047 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002049 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002050 unsigned int base64bits = 0;
2051 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002052 char * out;
2053 char * start;
2054
2055 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002057
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002058 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002059 return PyErr_NoMemory();
2060
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 if (v == NULL)
2063 return NULL;
2064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002065 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066 for (;i < size; ++i) {
2067 Py_UNICODE ch = s[i];
2068
Antoine Pitrou244651a2009-05-04 18:56:13 +00002069 if (inShift) {
2070 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2071 /* shifting out */
2072 if (base64bits) { /* output remaining bits */
2073 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2074 base64buffer = 0;
2075 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002076 }
2077 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002078 /* Characters not in the BASE64 set implicitly unshift the sequence
2079 so no '-' is required, except if the character is itself a '-' */
2080 if (IS_BASE64(ch) || ch == '-') {
2081 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002082 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083 *out++ = (char) ch;
2084 }
2085 else {
2086 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002087 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 else { /* not in a shift sequence */
2090 if (ch == '+') {
2091 *out++ = '+';
2092 *out++ = '-';
2093 }
2094 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2095 *out++ = (char) ch;
2096 }
2097 else {
2098 *out++ = '+';
2099 inShift = 1;
2100 goto encode_char;
2101 }
2102 }
2103 continue;
2104encode_char:
2105#ifdef Py_UNICODE_WIDE
2106 if (ch >= 0x10000) {
2107 /* code first surrogate */
2108 base64bits += 16;
2109 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2110 while (base64bits >= 6) {
2111 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2112 base64bits -= 6;
2113 }
2114 /* prepare second surrogate */
2115 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2116 }
2117#endif
2118 base64bits += 16;
2119 base64buffer = (base64buffer << 16) | ch;
2120 while (base64bits >= 6) {
2121 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2122 base64bits -= 6;
2123 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002124 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002125 if (base64bits)
2126 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2127 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002128 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002129 if (_PyBytes_Resize(&v, out - start) < 0)
2130 return NULL;
2131 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002132}
2133
Antoine Pitrou244651a2009-05-04 18:56:13 +00002134#undef IS_BASE64
2135#undef FROM_BASE64
2136#undef TO_BASE64
2137#undef DECODE_DIRECT
2138#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140/* --- UTF-8 Codec -------------------------------------------------------- */
2141
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
2163
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 Py_ssize_t size,
2166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
Walter Dörwald69652032004-09-07 20:24:22 +00002168 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2169}
2170
Antoine Pitrouab868312009-01-10 15:40:25 +00002171/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2172#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2173
2174/* Mask to quickly check whether a C 'long' contains a
2175 non-ASCII, UTF8-encoded char. */
2176#if (SIZEOF_LONG == 8)
2177# define ASCII_CHAR_MASK 0x8080808080808080L
2178#elif (SIZEOF_LONG == 4)
2179# define ASCII_CHAR_MASK 0x80808080L
2180#else
2181# error C 'long' size should be either 4 or 8!
2182#endif
2183
Walter Dörwald69652032004-09-07 20:24:22 +00002184PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002185 Py_ssize_t size,
2186 const char *errors,
2187 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002188{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002191 Py_ssize_t startinpos;
2192 Py_ssize_t endinpos;
2193 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002194 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 PyUnicodeObject *unicode;
2196 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002197 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002198 PyObject *errorHandler = NULL;
2199 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200
2201 /* Note: size will always be longer than the resulting Unicode
2202 character count */
2203 unicode = _PyUnicode_New(size);
2204 if (!unicode)
2205 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002206 if (size == 0) {
2207 if (consumed)
2208 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
2212 /* Unpack UTF-8 encoded data */
2213 p = unicode->str;
2214 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002215 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002218 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219
2220 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002221 /* Fast path for runs of ASCII characters. Given that common UTF-8
2222 input will consist of an overwhelming majority of ASCII
2223 characters, we try to optimize for this case by checking
2224 as many characters as a C 'long' can contain.
2225 First, check if we can do an aligned read, as most CPUs have
2226 a penalty for unaligned reads.
2227 */
2228 if (!((size_t) s & LONG_PTR_MASK)) {
2229 /* Help register allocation */
2230 register const char *_s = s;
2231 register Py_UNICODE *_p = p;
2232 while (_s < aligned_end) {
2233 /* Read a whole long at a time (either 4 or 8 bytes),
2234 and do a fast unrolled copy if it only contains ASCII
2235 characters. */
2236 unsigned long data = *(unsigned long *) _s;
2237 if (data & ASCII_CHAR_MASK)
2238 break;
2239 _p[0] = (unsigned char) _s[0];
2240 _p[1] = (unsigned char) _s[1];
2241 _p[2] = (unsigned char) _s[2];
2242 _p[3] = (unsigned char) _s[3];
2243#if (SIZEOF_LONG == 8)
2244 _p[4] = (unsigned char) _s[4];
2245 _p[5] = (unsigned char) _s[5];
2246 _p[6] = (unsigned char) _s[6];
2247 _p[7] = (unsigned char) _s[7];
2248#endif
2249 _s += SIZEOF_LONG;
2250 _p += SIZEOF_LONG;
2251 }
2252 s = _s;
2253 p = _p;
2254 if (s == e)
2255 break;
2256 ch = (unsigned char)*s;
2257 }
2258 }
2259
2260 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002261 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 s++;
2263 continue;
2264 }
2265
2266 n = utf8_code_length[ch];
2267
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002268 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 if (consumed)
2270 break;
2271 else {
2272 errmsg = "unexpected end of data";
2273 startinpos = s-starts;
2274 endinpos = size;
2275 goto utf8Error;
2276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278
2279 switch (n) {
2280
2281 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002282 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 startinpos = s-starts;
2284 endinpos = startinpos+1;
2285 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286
2287 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002288 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 if ((s[1] & 0xc0) != 0x80) {
2295 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002296 startinpos = s-starts;
2297 endinpos = startinpos+2;
2298 goto utf8Error;
2299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002301 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
2303 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002304 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002305 goto utf8Error;
2306 }
2307 else
2308 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 break;
2310
2311 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002312 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002313 (s[2] & 0xc0) != 0x80) {
2314 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002315 startinpos = s-starts;
2316 endinpos = startinpos+3;
2317 goto utf8Error;
2318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002320 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002321 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002322 startinpos = s-starts;
2323 endinpos = startinpos+3;
2324 goto utf8Error;
2325 }
2326 else
2327 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002328 break;
2329
2330 case 4:
2331 if ((s[1] & 0xc0) != 0x80 ||
2332 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002333 (s[3] & 0xc0) != 0x80) {
2334 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002335 startinpos = s-starts;
2336 endinpos = startinpos+4;
2337 goto utf8Error;
2338 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002339 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002341 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002342 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002344 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002345 UTF-16 */
2346 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002347 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 startinpos = s-starts;
2349 endinpos = startinpos+4;
2350 goto utf8Error;
2351 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002352#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002354#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002355 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002356
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002357 /* translate from 10000..10FFFF to 0..FFFF */
2358 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002359
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002360 /* high surrogate = top 10 bits added to D800 */
2361 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002362
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002363 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002364 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002365#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 break;
2367
2368 default:
2369 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002370 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002371 startinpos = s-starts;
2372 endinpos = startinpos+n;
2373 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 }
2375 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Benjamin Peterson29060642009-01-31 22:14:21 +00002378 utf8Error:
2379 outpos = p-PyUnicode_AS_UNICODE(unicode);
2380 if (unicode_decode_call_errorhandler(
2381 errors, &errorHandler,
2382 "utf8", errmsg,
2383 &starts, &e, &startinpos, &endinpos, &exc, &s,
2384 &unicode, &outpos, &p))
2385 goto onError;
2386 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 }
Walter Dörwald69652032004-09-07 20:24:22 +00002388 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002392 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 goto onError;
2394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002395 Py_XDECREF(errorHandler);
2396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 return (PyObject *)unicode;
2398
Benjamin Peterson29060642009-01-31 22:14:21 +00002399 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 Py_XDECREF(errorHandler);
2401 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 Py_DECREF(unicode);
2403 return NULL;
2404}
2405
Antoine Pitrouab868312009-01-10 15:40:25 +00002406#undef ASCII_CHAR_MASK
2407
2408
Tim Peters602f7402002-04-27 18:03:26 +00002409/* Allocation strategy: if the string is short, convert into a stack buffer
2410 and allocate exactly as much space needed at the end. Else allocate the
2411 maximum possible needed (4 result bytes per Unicode character), and return
2412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002414PyObject *
2415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 Py_ssize_t size,
2417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418{
Tim Peters602f7402002-04-27 18:03:26 +00002419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002420
Guido van Rossum98297ee2007-11-06 21:34:58 +00002421 Py_ssize_t i; /* index into s of next input byte */
2422 PyObject *result; /* result string object */
2423 char *p; /* next free byte in output buffer */
2424 Py_ssize_t nallocated; /* number of result bytes allocated */
2425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002427 PyObject *errorHandler = NULL;
2428 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002429
Tim Peters602f7402002-04-27 18:03:26 +00002430 assert(s != NULL);
2431 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432
Tim Peters602f7402002-04-27 18:03:26 +00002433 if (size <= MAX_SHORT_UNICHARS) {
2434 /* Write into the stack buffer; nallocated can't overflow.
2435 * At the end, we'll allocate exactly as much heap space as it
2436 * turns out we need.
2437 */
2438 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002439 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002440 p = stackbuf;
2441 }
2442 else {
2443 /* Overallocate on the heap, and give the excess back at the end. */
2444 nallocated = size * 4;
2445 if (nallocated / 4 != size) /* overflow! */
2446 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002447 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002448 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002449 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002450 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002451 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002452
Tim Peters602f7402002-04-27 18:03:26 +00002453 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002454 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002455
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002456 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002457 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002459
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002461 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002462 *p++ = (char)(0xc0 | (ch >> 6));
2463 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002464 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002465 else {
Tim Peters602f7402002-04-27 18:03:26 +00002466 /* Encode UCS2 Unicode ordinals */
2467 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002468#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002469 /* Special case: check for high surrogate */
2470 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2471 Py_UCS4 ch2 = s[i];
2472 /* Check for low surrogate and combine the two to
2473 form a UCS4 value */
2474 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002475 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002476 i++;
2477 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002478 }
Tim Peters602f7402002-04-27 18:03:26 +00002479 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002480 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002481#endif
2482 if (ch >= 0xd800 && ch <= 0xdfff) {
2483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 char *prep;
2486 int k;
2487 rep = unicode_encode_call_errorhandler
2488 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2489 s, size, &exc, i-1, i, &newpos);
2490 if (!rep)
2491 goto error;
2492 /* Implementation limitations: only support error handler that return
2493 bytes, and only support up to four replacement bytes. */
2494 if (!PyBytes_Check(rep)) {
2495 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2496 Py_DECREF(rep);
2497 goto error;
2498 }
2499 if (PyBytes_Size(rep) > 4) {
2500 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2501 Py_DECREF(rep);
2502 goto error;
2503 }
2504 prep = PyBytes_AsString(rep);
2505 for(k = PyBytes_Size(rep); k > 0; k--)
2506 *p++ = *prep++;
2507 Py_DECREF(rep);
2508 continue;
2509
2510 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002511 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002512 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2513 *p++ = (char)(0x80 | (ch & 0x3f));
2514 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 }
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002516#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002517 encodeUCS4:
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002518#endif
Tim Peters602f7402002-04-27 18:03:26 +00002519 /* Encode UCS4 Unicode ordinals */
2520 *p++ = (char)(0xf0 | (ch >> 18));
2521 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2522 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2523 *p++ = (char)(0x80 | (ch & 0x3f));
2524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002526
Guido van Rossum98297ee2007-11-06 21:34:58 +00002527 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002528 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002529 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002530 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
2533 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002534 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002535 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002536 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002537 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002538 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002539 Py_XDECREF(errorHandler);
2540 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002541 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002542 error:
2543 Py_XDECREF(errorHandler);
2544 Py_XDECREF(exc);
2545 Py_XDECREF(result);
2546 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002547
Tim Peters602f7402002-04-27 18:03:26 +00002548#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549}
2550
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2552{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (!PyUnicode_Check(unicode)) {
2554 PyErr_BadArgument();
2555 return NULL;
2556 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002557 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002558 PyUnicode_GET_SIZE(unicode),
2559 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560}
2561
Walter Dörwald41980ca2007-08-16 21:55:45 +00002562/* --- UTF-32 Codec ------------------------------------------------------- */
2563
2564PyObject *
2565PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002566 Py_ssize_t size,
2567 const char *errors,
2568 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002569{
2570 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2571}
2572
2573PyObject *
2574PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002575 Py_ssize_t size,
2576 const char *errors,
2577 int *byteorder,
2578 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002579{
2580 const char *starts = s;
2581 Py_ssize_t startinpos;
2582 Py_ssize_t endinpos;
2583 Py_ssize_t outpos;
2584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
2586#ifndef Py_UNICODE_WIDE
2587 int i, pairs;
2588#else
2589 const int pairs = 0;
2590#endif
2591 const unsigned char *q, *e;
2592 int bo = 0; /* assume native ordering by default */
2593 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002594 /* Offsets from q for retrieving bytes in the right order. */
2595#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2596 int iorder[] = {0, 1, 2, 3};
2597#else
2598 int iorder[] = {3, 2, 1, 0};
2599#endif
2600 PyObject *errorHandler = NULL;
2601 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002602 /* On narrow builds we split characters outside the BMP into two
2603 codepoints => count how much extra space we need. */
2604#ifndef Py_UNICODE_WIDE
2605 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 if (((Py_UCS4 *)s)[i] >= 0x10000)
2607 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002608#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002609
2610 /* This might be one to much, because of a BOM */
2611 unicode = _PyUnicode_New((size+3)/4+pairs);
2612 if (!unicode)
2613 return NULL;
2614 if (size == 0)
2615 return (PyObject *)unicode;
2616
2617 /* Unpack UTF-32 encoded data */
2618 p = unicode->str;
2619 q = (unsigned char *)s;
2620 e = q + size;
2621
2622 if (byteorder)
2623 bo = *byteorder;
2624
2625 /* Check for BOM marks (U+FEFF) in the input and adjust current
2626 byte order setting accordingly. In native mode, the leading BOM
2627 mark is skipped, in all other modes, it is copied to the output
2628 stream as-is (giving a ZWNBSP character). */
2629 if (bo == 0) {
2630 if (size >= 4) {
2631 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002633#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 if (bom == 0x0000FEFF) {
2635 q += 4;
2636 bo = -1;
2637 }
2638 else if (bom == 0xFFFE0000) {
2639 q += 4;
2640 bo = 1;
2641 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002642#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 if (bom == 0x0000FEFF) {
2644 q += 4;
2645 bo = 1;
2646 }
2647 else if (bom == 0xFFFE0000) {
2648 q += 4;
2649 bo = -1;
2650 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002651#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002653 }
2654
2655 if (bo == -1) {
2656 /* force LE */
2657 iorder[0] = 0;
2658 iorder[1] = 1;
2659 iorder[2] = 2;
2660 iorder[3] = 3;
2661 }
2662 else if (bo == 1) {
2663 /* force BE */
2664 iorder[0] = 3;
2665 iorder[1] = 2;
2666 iorder[2] = 1;
2667 iorder[3] = 0;
2668 }
2669
2670 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 Py_UCS4 ch;
2672 /* remaining bytes at the end? (size should be divisible by 4) */
2673 if (e-q<4) {
2674 if (consumed)
2675 break;
2676 errmsg = "truncated data";
2677 startinpos = ((const char *)q)-starts;
2678 endinpos = ((const char *)e)-starts;
2679 goto utf32Error;
2680 /* The remaining input chars are ignored if the callback
2681 chooses to skip the input */
2682 }
2683 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2684 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002685
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 if (ch >= 0x110000)
2687 {
2688 errmsg = "codepoint not in range(0x110000)";
2689 startinpos = ((const char *)q)-starts;
2690 endinpos = startinpos+4;
2691 goto utf32Error;
2692 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002693#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 if (ch >= 0x10000)
2695 {
2696 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2697 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2698 }
2699 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002700#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 *p++ = ch;
2702 q += 4;
2703 continue;
2704 utf32Error:
2705 outpos = p-PyUnicode_AS_UNICODE(unicode);
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "utf32", errmsg,
2709 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2710 &unicode, &outpos, &p))
2711 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002712 }
2713
2714 if (byteorder)
2715 *byteorder = bo;
2716
2717 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719
2720 /* Adjust length */
2721 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2722 goto onError;
2723
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return (PyObject *)unicode;
2727
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002729 Py_DECREF(unicode);
2730 Py_XDECREF(errorHandler);
2731 Py_XDECREF(exc);
2732 return NULL;
2733}
2734
2735PyObject *
2736PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 Py_ssize_t size,
2738 const char *errors,
2739 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002740{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002741 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002743 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002744#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002745 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002746#else
2747 const int pairs = 0;
2748#endif
2749 /* Offsets from p for storing byte pairs in the right order. */
2750#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2751 int iorder[] = {0, 1, 2, 3};
2752#else
2753 int iorder[] = {3, 2, 1, 0};
2754#endif
2755
Benjamin Peterson29060642009-01-31 22:14:21 +00002756#define STORECHAR(CH) \
2757 do { \
2758 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2759 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2760 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2761 p[iorder[0]] = (CH) & 0xff; \
2762 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763 } while(0)
2764
2765 /* In narrow builds we can output surrogate pairs as one codepoint,
2766 so we need less space. */
2767#ifndef Py_UNICODE_WIDE
2768 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2770 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2771 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002773 nsize = (size - pairs + (byteorder == 0));
2774 bytesize = nsize * 4;
2775 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002777 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778 if (v == NULL)
2779 return NULL;
2780
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002781 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002785 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002786
2787 if (byteorder == -1) {
2788 /* force LE */
2789 iorder[0] = 0;
2790 iorder[1] = 1;
2791 iorder[2] = 2;
2792 iorder[3] = 3;
2793 }
2794 else if (byteorder == 1) {
2795 /* force BE */
2796 iorder[0] = 3;
2797 iorder[1] = 2;
2798 iorder[2] = 1;
2799 iorder[3] = 0;
2800 }
2801
2802 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002804#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2806 Py_UCS4 ch2 = *s;
2807 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2808 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2809 s++;
2810 size--;
2811 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002813#endif
2814 STORECHAR(ch);
2815 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002816
2817 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002818 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819#undef STORECHAR
2820}
2821
2822PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 PyUnicode_GET_SIZE(unicode),
2830 NULL,
2831 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002832}
2833
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834/* --- UTF-16 Codec ------------------------------------------------------- */
2835
Tim Peters772747b2001-08-09 22:21:55 +00002836PyObject *
2837PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 Py_ssize_t size,
2839 const char *errors,
2840 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841{
Walter Dörwald69652032004-09-07 20:24:22 +00002842 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2843}
2844
Antoine Pitrouab868312009-01-10 15:40:25 +00002845/* Two masks for fast checking of whether a C 'long' may contain
2846 UTF16-encoded surrogate characters. This is an efficient heuristic,
2847 assuming that non-surrogate characters with a code point >= 0x8000 are
2848 rare in most input.
2849 FAST_CHAR_MASK is used when the input is in native byte ordering,
2850 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002851*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002852#if (SIZEOF_LONG == 8)
2853# define FAST_CHAR_MASK 0x8000800080008000L
2854# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2855#elif (SIZEOF_LONG == 4)
2856# define FAST_CHAR_MASK 0x80008000L
2857# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2858#else
2859# error C 'long' size should be either 4 or 8!
2860#endif
2861
Walter Dörwald69652032004-09-07 20:24:22 +00002862PyObject *
2863PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 Py_ssize_t size,
2865 const char *errors,
2866 int *byteorder,
2867 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002869 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002870 Py_ssize_t startinpos;
2871 Py_ssize_t endinpos;
2872 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 PyUnicodeObject *unicode;
2874 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002875 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002876 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002877 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002878 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002879 /* Offsets from q for retrieving byte pairs in the right order. */
2880#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2881 int ihi = 1, ilo = 0;
2882#else
2883 int ihi = 0, ilo = 1;
2884#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885 PyObject *errorHandler = NULL;
2886 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 /* Note: size will always be longer than the resulting Unicode
2889 character count */
2890 unicode = _PyUnicode_New(size);
2891 if (!unicode)
2892 return NULL;
2893 if (size == 0)
2894 return (PyObject *)unicode;
2895
2896 /* Unpack UTF-16 encoded data */
2897 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002898 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002899 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
2901 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002902 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002904 /* Check for BOM marks (U+FEFF) in the input and adjust current
2905 byte order setting accordingly. In native mode, the leading BOM
2906 mark is skipped, in all other modes, it is copied to the output
2907 stream as-is (giving a ZWNBSP character). */
2908 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002909 if (size >= 2) {
2910 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002911#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 if (bom == 0xFEFF) {
2913 q += 2;
2914 bo = -1;
2915 }
2916 else if (bom == 0xFFFE) {
2917 q += 2;
2918 bo = 1;
2919 }
Tim Petersced69f82003-09-16 20:30:58 +00002920#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 if (bom == 0xFEFF) {
2922 q += 2;
2923 bo = 1;
2924 }
2925 else if (bom == 0xFFFE) {
2926 q += 2;
2927 bo = -1;
2928 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002929#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Tim Peters772747b2001-08-09 22:21:55 +00002933 if (bo == -1) {
2934 /* force LE */
2935 ihi = 1;
2936 ilo = 0;
2937 }
2938 else if (bo == 1) {
2939 /* force BE */
2940 ihi = 0;
2941 ilo = 1;
2942 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002943#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2944 native_ordering = ilo < ihi;
2945#else
2946 native_ordering = ilo > ihi;
2947#endif
Tim Peters772747b2001-08-09 22:21:55 +00002948
Antoine Pitrouab868312009-01-10 15:40:25 +00002949 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002950 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002952 /* First check for possible aligned read of a C 'long'. Unaligned
2953 reads are more expensive, better to defer to another iteration. */
2954 if (!((size_t) q & LONG_PTR_MASK)) {
2955 /* Fast path for runs of non-surrogate chars. */
2956 register const unsigned char *_q = q;
2957 Py_UNICODE *_p = p;
2958 if (native_ordering) {
2959 /* Native ordering is simple: as long as the input cannot
2960 possibly contain a surrogate char, do an unrolled copy
2961 of several 16-bit code points to the target object.
2962 The non-surrogate check is done on several input bytes
2963 at a time (as many as a C 'long' can contain). */
2964 while (_q < aligned_end) {
2965 unsigned long data = * (unsigned long *) _q;
2966 if (data & FAST_CHAR_MASK)
2967 break;
2968 _p[0] = ((unsigned short *) _q)[0];
2969 _p[1] = ((unsigned short *) _q)[1];
2970#if (SIZEOF_LONG == 8)
2971 _p[2] = ((unsigned short *) _q)[2];
2972 _p[3] = ((unsigned short *) _q)[3];
2973#endif
2974 _q += SIZEOF_LONG;
2975 _p += SIZEOF_LONG / 2;
2976 }
2977 }
2978 else {
2979 /* Byteswapped ordering is similar, but we must decompose
2980 the copy bytewise, and take care of zero'ing out the
2981 upper bytes if the target object is in 32-bit units
2982 (that is, in UCS-4 builds). */
2983 while (_q < aligned_end) {
2984 unsigned long data = * (unsigned long *) _q;
2985 if (data & SWAPPED_FAST_CHAR_MASK)
2986 break;
2987 /* Zero upper bytes in UCS-4 builds */
2988#if (Py_UNICODE_SIZE > 2)
2989 _p[0] = 0;
2990 _p[1] = 0;
2991#if (SIZEOF_LONG == 8)
2992 _p[2] = 0;
2993 _p[3] = 0;
2994#endif
2995#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002996 /* Issue #4916; UCS-4 builds on big endian machines must
2997 fill the two last bytes of each 4-byte unit. */
2998#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2999# define OFF 2
3000#else
3001# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003002#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003003 ((unsigned char *) _p)[OFF + 1] = _q[0];
3004 ((unsigned char *) _p)[OFF + 0] = _q[1];
3005 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3006 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3007#if (SIZEOF_LONG == 8)
3008 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3009 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3010 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3011 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3012#endif
3013#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003014 _q += SIZEOF_LONG;
3015 _p += SIZEOF_LONG / 2;
3016 }
3017 }
3018 p = _p;
3019 q = _q;
3020 if (q >= e)
3021 break;
3022 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024
Benjamin Peterson14339b62009-01-31 16:36:08 +00003025 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003026
3027 if (ch < 0xD800 || ch > 0xDFFF) {
3028 *p++ = ch;
3029 continue;
3030 }
3031
3032 /* UTF-16 code pair: */
3033 if (q > e) {
3034 errmsg = "unexpected end of data";
3035 startinpos = (((const char *)q) - 2) - starts;
3036 endinpos = ((const char *)e) + 1 - starts;
3037 goto utf16Error;
3038 }
3039 if (0xD800 <= ch && ch <= 0xDBFF) {
3040 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3041 q += 2;
3042 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003043#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 *p++ = ch;
3045 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003048#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 continue;
3050 }
3051 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003052 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 startinpos = (((const char *)q)-4)-starts;
3054 endinpos = startinpos+2;
3055 goto utf16Error;
3056 }
3057
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 errmsg = "illegal encoding";
3060 startinpos = (((const char *)q)-2)-starts;
3061 endinpos = startinpos+2;
3062 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003063
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 utf16Error:
3065 outpos = p - PyUnicode_AS_UNICODE(unicode);
3066 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003067 errors,
3068 &errorHandler,
3069 "utf16", errmsg,
3070 &starts,
3071 (const char **)&e,
3072 &startinpos,
3073 &endinpos,
3074 &exc,
3075 (const char **)&q,
3076 &unicode,
3077 &outpos,
3078 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003081 /* remaining byte at the end? (size should be even) */
3082 if (e == q) {
3083 if (!consumed) {
3084 errmsg = "truncated data";
3085 startinpos = ((const char *)q) - starts;
3086 endinpos = ((const char *)e) + 1 - starts;
3087 outpos = p - PyUnicode_AS_UNICODE(unicode);
3088 if (unicode_decode_call_errorhandler(
3089 errors,
3090 &errorHandler,
3091 "utf16", errmsg,
3092 &starts,
3093 (const char **)&e,
3094 &startinpos,
3095 &endinpos,
3096 &exc,
3097 (const char **)&q,
3098 &unicode,
3099 &outpos,
3100 &p))
3101 goto onError;
3102 /* The remaining input chars are ignored if the callback
3103 chooses to skip the input */
3104 }
3105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106
3107 if (byteorder)
3108 *byteorder = bo;
3109
Walter Dörwald69652032004-09-07 20:24:22 +00003110 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003114 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 goto onError;
3116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 Py_XDECREF(errorHandler);
3118 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 return (PyObject *)unicode;
3120
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 Py_XDECREF(errorHandler);
3124 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 return NULL;
3126}
3127
Antoine Pitrouab868312009-01-10 15:40:25 +00003128#undef FAST_CHAR_MASK
3129#undef SWAPPED_FAST_CHAR_MASK
3130
Tim Peters772747b2001-08-09 22:21:55 +00003131PyObject *
3132PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 Py_ssize_t size,
3134 const char *errors,
3135 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003137 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003138 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003139 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003140#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003141 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003142#else
3143 const int pairs = 0;
3144#endif
Tim Peters772747b2001-08-09 22:21:55 +00003145 /* Offsets from p for storing byte pairs in the right order. */
3146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3147 int ihi = 1, ilo = 0;
3148#else
3149 int ihi = 0, ilo = 1;
3150#endif
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152#define STORECHAR(CH) \
3153 do { \
3154 p[ihi] = ((CH) >> 8) & 0xff; \
3155 p[ilo] = (CH) & 0xff; \
3156 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003157 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003159#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003160 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 if (s[i] >= 0x10000)
3162 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003163#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003164 /* 2 * (size + pairs + (byteorder == 0)) */
3165 if (size > PY_SSIZE_T_MAX ||
3166 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003168 nsize = size + pairs + (byteorder == 0);
3169 bytesize = nsize * 2;
3170 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003172 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (v == NULL)
3174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003176 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003179 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003180 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003181
3182 if (byteorder == -1) {
3183 /* force LE */
3184 ihi = 1;
3185 ilo = 0;
3186 }
3187 else if (byteorder == 1) {
3188 /* force BE */
3189 ihi = 0;
3190 ilo = 1;
3191 }
3192
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003193 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 Py_UNICODE ch = *s++;
3195 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003196#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 if (ch >= 0x10000) {
3198 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3199 ch = 0xD800 | ((ch-0x10000) >> 10);
3200 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003201#endif
Tim Peters772747b2001-08-09 22:21:55 +00003202 STORECHAR(ch);
3203 if (ch2)
3204 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003205 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003206
3207 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003208 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003209#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210}
3211
3212PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3213{
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 return NULL;
3217 }
3218 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 PyUnicode_GET_SIZE(unicode),
3220 NULL,
3221 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222}
3223
3224/* --- Unicode Escape Codec ----------------------------------------------- */
3225
Fredrik Lundh06d12682001-01-24 07:59:11 +00003226static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003227
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 Py_ssize_t size,
3230 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t startinpos;
3234 Py_ssize_t endinpos;
3235 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003240 char* message;
3241 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 PyObject *errorHandler = NULL;
3243 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 /* Escaped strings will always be longer than the resulting
3246 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 length after conversion to the true value.
3248 (but if the error callback returns a long replacement string
3249 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 v = _PyUnicode_New(size);
3251 if (v == NULL)
3252 goto onError;
3253 if (size == 0)
3254 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003258
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 while (s < end) {
3260 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003261 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263
3264 /* Non-escape characters are interpreted as Unicode ordinals */
3265 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003266 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 continue;
3268 }
3269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 /* \ - Escapes */
3272 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003273 c = *s++;
3274 if (s > end)
3275 c = '\0'; /* Invalid after \ */
3276 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 case '\n': break;
3280 case '\\': *p++ = '\\'; break;
3281 case '\'': *p++ = '\''; break;
3282 case '\"': *p++ = '\"'; break;
3283 case 'b': *p++ = '\b'; break;
3284 case 'f': *p++ = '\014'; break; /* FF */
3285 case 't': *p++ = '\t'; break;
3286 case 'n': *p++ = '\n'; break;
3287 case 'r': *p++ = '\r'; break;
3288 case 'v': *p++ = '\013'; break; /* VT */
3289 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 case '0': case '1': case '2': case '3':
3293 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003294 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003295 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003296 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003297 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003298 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003300 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 break;
3302
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 /* hex escapes */
3304 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003306 digits = 2;
3307 message = "truncated \\xXX escape";
3308 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003312 digits = 4;
3313 message = "truncated \\uXXXX escape";
3314 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003317 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003318 digits = 8;
3319 message = "truncated \\UXXXXXXXX escape";
3320 hexescape:
3321 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 outpos = p-PyUnicode_AS_UNICODE(v);
3323 if (s+digits>end) {
3324 endinpos = size;
3325 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 errors, &errorHandler,
3327 "unicodeescape", "end of string in escape sequence",
3328 &starts, &end, &startinpos, &endinpos, &exc, &s,
3329 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 goto onError;
3331 goto nextByte;
3332 }
3333 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003334 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003335 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 endinpos = (s+i+1)-starts;
3337 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 errors, &errorHandler,
3339 "unicodeescape", message,
3340 &starts, &end, &startinpos, &endinpos, &exc, &s,
3341 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003344 }
3345 chr = (chr<<4) & ~0xF;
3346 if (c >= '0' && c <= '9')
3347 chr += c - '0';
3348 else if (c >= 'a' && c <= 'f')
3349 chr += 10 + c - 'a';
3350 else
3351 chr += 10 + c - 'A';
3352 }
3353 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003354 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 /* _decoding_error will have already written into the
3356 target buffer. */
3357 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003358 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003359 /* when we get here, chr is a 32-bit unicode character */
3360 if (chr <= 0xffff)
3361 /* UCS-2 character */
3362 *p++ = (Py_UNICODE) chr;
3363 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003364 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003365 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003366#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003367 *p++ = chr;
3368#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003369 chr -= 0x10000L;
3370 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003371 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003372#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003373 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 endinpos = s-starts;
3375 outpos = p-PyUnicode_AS_UNICODE(v);
3376 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 errors, &errorHandler,
3378 "unicodeescape", "illegal Unicode character",
3379 &starts, &end, &startinpos, &endinpos, &exc, &s,
3380 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003381 goto onError;
3382 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003383 break;
3384
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003386 case 'N':
3387 message = "malformed \\N character escape";
3388 if (ucnhash_CAPI == NULL) {
3389 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003390 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003391 if (ucnhash_CAPI == NULL)
3392 goto ucnhashError;
3393 }
3394 if (*s == '{') {
3395 const char *start = s+1;
3396 /* look for the closing brace */
3397 while (*s != '}' && s < end)
3398 s++;
3399 if (s > start && s < end && *s == '}') {
3400 /* found a name. look it up in the unicode database */
3401 message = "unknown Unicode character name";
3402 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003403 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003404 goto store;
3405 }
3406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 endinpos = s-starts;
3408 outpos = p-PyUnicode_AS_UNICODE(v);
3409 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 errors, &errorHandler,
3411 "unicodeescape", message,
3412 &starts, &end, &startinpos, &endinpos, &exc, &s,
3413 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003414 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003415 break;
3416
3417 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003418 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 message = "\\ at end of string";
3420 s--;
3421 endinpos = s-starts;
3422 outpos = p-PyUnicode_AS_UNICODE(v);
3423 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 errors, &errorHandler,
3425 "unicodeescape", message,
3426 &starts, &end, &startinpos, &endinpos, &exc, &s,
3427 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003428 goto onError;
3429 }
3430 else {
3431 *p++ = '\\';
3432 *p++ = (unsigned char)s[-1];
3433 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003434 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003439 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003441 Py_XDECREF(errorHandler);
3442 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003444
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003446 PyErr_SetString(
3447 PyExc_UnicodeError,
3448 "\\N escapes not supported (can't load unicodedata module)"
3449 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003450 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 Py_XDECREF(errorHandler);
3452 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003453 return NULL;
3454
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 Py_XDECREF(errorHandler);
3458 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 return NULL;
3460}
3461
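/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment describes PyUnicode_EncodeUnicodeEscape() further
   below, not the findchar() helper that immediately follows.  That
   function takes no `quotes` argument and never wraps its output in
   u"" or u'' quotes.

*/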
3468
Thomas Wouters477c8d52006-05-27 19:21:47 +00003469Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 Py_ssize_t size,
3471 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003472{
3473 /* like wcschr, but doesn't stop at NULL characters */
3474
3475 while (size-- > 0) {
3476 if (*s == ch)
3477 return s;
3478 s++;
3479 }
3480
3481 return NULL;
3482}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003483
Walter Dörwald79e913e2007-05-12 11:08:06 +00003484static const char *hexdigits = "0123456789abcdef";
3485
3486PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003489 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003492#ifdef Py_UNICODE_WIDE
3493 const Py_ssize_t expandsize = 10;
3494#else
3495 const Py_ssize_t expandsize = 6;
3496#endif
3497
Thomas Wouters89f507f2006-12-13 04:49:30 +00003498 /* XXX(nnorwitz): rather than over-allocating, it would be
3499 better to choose a different scheme. Perhaps scan the
3500 first N-chars of the string and allocate based on that size.
3501 */
3502 /* Initial allocation is based on the longest-possible unichr
3503 escape.
3504
3505 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3506 unichr, so in this case it's the longest unichr escape. In
3507 narrow (UTF-16) builds this is five chars per source unichr
3508 since there are two unichrs in the surrogate pair, so in narrow
3509 (UTF-16) builds it's not the longest unichr escape.
3510
3511 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3512 so in the narrow (UTF-16) build case it's the longest unichr
3513 escape.
3514 */
3515
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003516 if (size == 0)
3517 return PyBytes_FromStringAndSize(NULL, 0);
3518
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003519 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003521
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003522 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 2
3524 + expandsize*size
3525 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 if (repr == NULL)
3527 return NULL;
3528
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003529 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 while (size-- > 0) {
3532 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003533
Walter Dörwald79e913e2007-05-12 11:08:06 +00003534 /* Escape backslashes */
3535 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 *p++ = '\\';
3537 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003538 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003539 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003540
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003541#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003542 /* Map 21-bit characters to '\U00xxxxxx' */
3543 else if (ch >= 0x10000) {
3544 *p++ = '\\';
3545 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003546 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3547 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3548 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3549 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3550 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3551 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3552 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3553 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003555 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003556#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3558 else if (ch >= 0xD800 && ch < 0xDC00) {
3559 Py_UNICODE ch2;
3560 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003561
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 ch2 = *s++;
3563 size--;
3564 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3565 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3566 *p++ = '\\';
3567 *p++ = 'U';
3568 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3569 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3570 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3571 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3572 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3573 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3574 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3575 *p++ = hexdigits[ucs & 0x0000000F];
3576 continue;
3577 }
3578 /* Fall through: isolated surrogates are copied as-is */
3579 s--;
3580 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003581 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003582#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003583
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003585 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 *p++ = '\\';
3587 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003588 *p++ = hexdigits[(ch >> 12) & 0x000F];
3589 *p++ = hexdigits[(ch >> 8) & 0x000F];
3590 *p++ = hexdigits[(ch >> 4) & 0x000F];
3591 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003593
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003594 /* Map special whitespace to '\t', \n', '\r' */
3595 else if (ch == '\t') {
3596 *p++ = '\\';
3597 *p++ = 't';
3598 }
3599 else if (ch == '\n') {
3600 *p++ = '\\';
3601 *p++ = 'n';
3602 }
3603 else if (ch == '\r') {
3604 *p++ = '\\';
3605 *p++ = 'r';
3606 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003607
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003608 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003609 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003611 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003612 *p++ = hexdigits[(ch >> 4) & 0x000F];
3613 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003614 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 /* Copy everything else as-is */
3617 else
3618 *p++ = (char) ch;
3619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003621 assert(p - PyBytes_AS_STRING(repr) > 0);
3622 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3623 return NULL;
3624 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625}
3626
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003627PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003629 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 if (!PyUnicode_Check(unicode)) {
3631 PyErr_BadArgument();
3632 return NULL;
3633 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003634 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3635 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003636 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637}
3638
3639/* --- Raw Unicode Escape Codec ------------------------------------------- */
3640
3641PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 Py_ssize_t size,
3643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003646 Py_ssize_t startinpos;
3647 Py_ssize_t endinpos;
3648 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 const char *end;
3652 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 PyObject *errorHandler = NULL;
3654 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 /* Escaped strings will always be longer than the resulting
3657 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 length after conversion to the true value. (But decoding error
3659 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 v = _PyUnicode_New(size);
3661 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 end = s + size;
3667 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 unsigned char c;
3669 Py_UCS4 x;
3670 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003671 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 /* Non-escape characters are interpreted as Unicode ordinals */
3674 if (*s != '\\') {
3675 *p++ = (unsigned char)*s++;
3676 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 startinpos = s-starts;
3679
3680 /* \u-escapes are only interpreted iff the number of leading
3681 backslashes if odd */
3682 bs = s;
3683 for (;s < end;) {
3684 if (*s != '\\')
3685 break;
3686 *p++ = (unsigned char)*s++;
3687 }
3688 if (((s - bs) & 1) == 0 ||
3689 s >= end ||
3690 (*s != 'u' && *s != 'U')) {
3691 continue;
3692 }
3693 p--;
3694 count = *s=='u' ? 4 : 8;
3695 s++;
3696
3697 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3698 outpos = p-PyUnicode_AS_UNICODE(v);
3699 for (x = 0, i = 0; i < count; ++i, ++s) {
3700 c = (unsigned char)*s;
3701 if (!ISXDIGIT(c)) {
3702 endinpos = s-starts;
3703 if (unicode_decode_call_errorhandler(
3704 errors, &errorHandler,
3705 "rawunicodeescape", "truncated \\uXXXX",
3706 &starts, &end, &startinpos, &endinpos, &exc, &s,
3707 &v, &outpos, &p))
3708 goto onError;
3709 goto nextByte;
3710 }
3711 x = (x<<4) & ~0xF;
3712 if (c >= '0' && c <= '9')
3713 x += c - '0';
3714 else if (c >= 'a' && c <= 'f')
3715 x += 10 + c - 'a';
3716 else
3717 x += 10 + c - 'A';
3718 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003719 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 /* UCS-2 character */
3721 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003722 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 /* UCS-4 character. Either store directly, or as
3724 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003725#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003727#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 x -= 0x10000L;
3729 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3730 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003731#endif
3732 } else {
3733 endinpos = s-starts;
3734 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003735 if (unicode_decode_call_errorhandler(
3736 errors, &errorHandler,
3737 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003738 &starts, &end, &startinpos, &endinpos, &exc, &s,
3739 &v, &outpos, &p))
3740 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003742 nextByte:
3743 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003745 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_XDECREF(errorHandler);
3748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 Py_XDECREF(errorHandler);
3754 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 return NULL;
3756}
3757
3758PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003761 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 char *p;
3763 char *q;
3764
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003765#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003766 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003767#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003768 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003769#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003770
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003771 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003773
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003774 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 if (repr == NULL)
3776 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003777 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003778 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003780 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 while (size-- > 0) {
3782 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003783#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 /* Map 32-bit characters to '\Uxxxxxxxx' */
3785 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003786 *p++ = '\\';
3787 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003788 *p++ = hexdigits[(ch >> 28) & 0xf];
3789 *p++ = hexdigits[(ch >> 24) & 0xf];
3790 *p++ = hexdigits[(ch >> 20) & 0xf];
3791 *p++ = hexdigits[(ch >> 16) & 0xf];
3792 *p++ = hexdigits[(ch >> 12) & 0xf];
3793 *p++ = hexdigits[(ch >> 8) & 0xf];
3794 *p++ = hexdigits[(ch >> 4) & 0xf];
3795 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003796 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003797 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003798#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3800 if (ch >= 0xD800 && ch < 0xDC00) {
3801 Py_UNICODE ch2;
3802 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003803
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 ch2 = *s++;
3805 size--;
3806 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3807 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3808 *p++ = '\\';
3809 *p++ = 'U';
3810 *p++ = hexdigits[(ucs >> 28) & 0xf];
3811 *p++ = hexdigits[(ucs >> 24) & 0xf];
3812 *p++ = hexdigits[(ucs >> 20) & 0xf];
3813 *p++ = hexdigits[(ucs >> 16) & 0xf];
3814 *p++ = hexdigits[(ucs >> 12) & 0xf];
3815 *p++ = hexdigits[(ucs >> 8) & 0xf];
3816 *p++ = hexdigits[(ucs >> 4) & 0xf];
3817 *p++ = hexdigits[ucs & 0xf];
3818 continue;
3819 }
3820 /* Fall through: isolated surrogates are copied as-is */
3821 s--;
3822 size++;
3823 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003824#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003825 /* Map 16-bit characters to '\uxxxx' */
3826 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 *p++ = '\\';
3828 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003829 *p++ = hexdigits[(ch >> 12) & 0xf];
3830 *p++ = hexdigits[(ch >> 8) & 0xf];
3831 *p++ = hexdigits[(ch >> 4) & 0xf];
3832 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 /* Copy everything else as-is */
3835 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 *p++ = (char) ch;
3837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003838 size = p - q;
3839
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003840 assert(size > 0);
3841 if (_PyBytes_Resize(&repr, size) < 0)
3842 return NULL;
3843 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844}
3845
3846PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3847{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003848 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003850 PyErr_BadArgument();
3851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003853 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3854 PyUnicode_GET_SIZE(unicode));
3855
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003856 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857}
3858
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003859/* --- Unicode Internal Codec ------------------------------------------- */
3860
3861PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 Py_ssize_t size,
3863 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003864{
3865 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003866 Py_ssize_t startinpos;
3867 Py_ssize_t endinpos;
3868 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003869 PyUnicodeObject *v;
3870 Py_UNICODE *p;
3871 const char *end;
3872 const char *reason;
3873 PyObject *errorHandler = NULL;
3874 PyObject *exc = NULL;
3875
Neal Norwitzd43069c2006-01-08 01:12:10 +00003876#ifdef Py_UNICODE_WIDE
3877 Py_UNICODE unimax = PyUnicode_GetMax();
3878#endif
3879
Thomas Wouters89f507f2006-12-13 04:49:30 +00003880 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003881 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3882 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003884 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003885 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003886 p = PyUnicode_AS_UNICODE(v);
3887 end = s + size;
3888
3889 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003890 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003891 /* We have to sanity check the raw data, otherwise doom looms for
3892 some malformed UCS-4 data. */
3893 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003894#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003895 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003896#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003897 end-s < Py_UNICODE_SIZE
3898 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003900 startinpos = s - starts;
3901 if (end-s < Py_UNICODE_SIZE) {
3902 endinpos = end-starts;
3903 reason = "truncated input";
3904 }
3905 else {
3906 endinpos = s - starts + Py_UNICODE_SIZE;
3907 reason = "illegal code point (> 0x10FFFF)";
3908 }
3909 outpos = p - PyUnicode_AS_UNICODE(v);
3910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003914 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003915 goto onError;
3916 }
3917 }
3918 else {
3919 p++;
3920 s += Py_UNICODE_SIZE;
3921 }
3922 }
3923
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003924 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003925 goto onError;
3926 Py_XDECREF(errorHandler);
3927 Py_XDECREF(exc);
3928 return (PyObject *)v;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003931 Py_XDECREF(v);
3932 Py_XDECREF(errorHandler);
3933 Py_XDECREF(exc);
3934 return NULL;
3935}
3936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937/* --- Latin-1 Codec ------------------------------------------------------ */
3938
3939PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 Py_ssize_t size,
3941 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
3943 PyUnicodeObject *v;
3944 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003945 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003946
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003948 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 Py_UNICODE r = *(unsigned char*)s;
3950 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003951 }
3952
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 v = _PyUnicode_New(size);
3954 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003959 e = s + size;
3960 /* Unrolling the copy makes it much faster by reducing the looping
3961 overhead. This is similar to what many memcpy() implementations do. */
3962 unrolled_end = e - 4;
3963 while (s < unrolled_end) {
3964 p[0] = (unsigned char) s[0];
3965 p[1] = (unsigned char) s[1];
3966 p[2] = (unsigned char) s[2];
3967 p[3] = (unsigned char) s[3];
3968 s += 4;
3969 p += 4;
3970 }
3971 while (s < e)
3972 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003974
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 Py_XDECREF(v);
3977 return NULL;
3978}
3979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980/* create or adjust a UnicodeEncodeError */
3981static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 const char *encoding,
3983 const Py_UNICODE *unicode, Py_ssize_t size,
3984 Py_ssize_t startpos, Py_ssize_t endpos,
3985 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 *exceptionObject = PyUnicodeEncodeError_Create(
3989 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 }
3991 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3993 goto onError;
3994 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3995 goto onError;
3996 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3997 goto onError;
3998 return;
3999 onError:
4000 Py_DECREF(*exceptionObject);
4001 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 }
4003}
4004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005/* raises a UnicodeEncodeError */
4006static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 const char *encoding,
4008 const Py_UNICODE *unicode, Py_ssize_t size,
4009 Py_ssize_t startpos, Py_ssize_t endpos,
4010 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011{
4012 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016}
4017
4018/* error handling callback helper:
4019 build arguments, call the callback and check the arguments,
4020 put the result into newpos and return the replacement string, which
4021 has to be freed by the caller */
4022static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 PyObject **errorHandler,
4024 const char *encoding, const char *reason,
4025 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4026 Py_ssize_t startpos, Py_ssize_t endpos,
4027 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004029 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030
4031 PyObject *restuple;
4032 PyObject *resunicode;
4033
4034 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 }
4039
4040 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044
4045 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004050 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 Py_DECREF(restuple);
4052 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004054 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 &resunicode, newpos)) {
4056 Py_DECREF(restuple);
4057 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004059 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4060 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4061 Py_DECREF(restuple);
4062 return NULL;
4063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004066 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4068 Py_DECREF(restuple);
4069 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 Py_INCREF(resunicode);
4072 Py_DECREF(restuple);
4073 return resunicode;
4074}
4075
4076static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 Py_ssize_t size,
4078 const char *errors,
4079 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080{
4081 /* output object */
4082 PyObject *res;
4083 /* pointers to the beginning and end+1 of input */
4084 const Py_UNICODE *startp = p;
4085 const Py_UNICODE *endp = p + size;
4086 /* pointer to the beginning of the unencodable characters */
4087 /* const Py_UNICODE *badp = NULL; */
4088 /* pointer into the output */
4089 char *str;
4090 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004091 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004092 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4093 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 PyObject *errorHandler = NULL;
4095 PyObject *exc = NULL;
4096 /* the following variable is used for caching string comparisons
4097 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4098 int known_errorHandler = -1;
4099
4100 /* allocate enough for a simple encoding without
4101 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004102 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004104 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004106 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004107 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 ressize = size;
4109
4110 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 /* can we encode this? */
4114 if (c<limit) {
4115 /* no overflow check, because we know that the space is enough */
4116 *str++ = (char)c;
4117 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004118 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 else {
4120 Py_ssize_t unicodepos = p-startp;
4121 Py_ssize_t requiredsize;
4122 PyObject *repunicode;
4123 Py_ssize_t repsize;
4124 Py_ssize_t newpos;
4125 Py_ssize_t respos;
4126 Py_UNICODE *uni2;
4127 /* startpos for collecting unencodable chars */
4128 const Py_UNICODE *collstart = p;
4129 const Py_UNICODE *collend = p;
4130 /* find all unecodable characters */
4131 while ((collend < endp) && ((*collend)>=limit))
4132 ++collend;
4133 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4134 if (known_errorHandler==-1) {
4135 if ((errors==NULL) || (!strcmp(errors, "strict")))
4136 known_errorHandler = 1;
4137 else if (!strcmp(errors, "replace"))
4138 known_errorHandler = 2;
4139 else if (!strcmp(errors, "ignore"))
4140 known_errorHandler = 3;
4141 else if (!strcmp(errors, "xmlcharrefreplace"))
4142 known_errorHandler = 4;
4143 else
4144 known_errorHandler = 0;
4145 }
4146 switch (known_errorHandler) {
4147 case 1: /* strict */
4148 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4149 goto onError;
4150 case 2: /* replace */
4151 while (collstart++<collend)
4152 *str++ = '?'; /* fall through */
4153 case 3: /* ignore */
4154 p = collend;
4155 break;
4156 case 4: /* xmlcharrefreplace */
4157 respos = str - PyBytes_AS_STRING(res);
4158 /* determine replacement size (temporarily (mis)uses p) */
4159 for (p = collstart, repsize = 0; p < collend; ++p) {
4160 if (*p<10)
4161 repsize += 2+1+1;
4162 else if (*p<100)
4163 repsize += 2+2+1;
4164 else if (*p<1000)
4165 repsize += 2+3+1;
4166 else if (*p<10000)
4167 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004168#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 else
4170 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004171#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 else if (*p<100000)
4173 repsize += 2+5+1;
4174 else if (*p<1000000)
4175 repsize += 2+6+1;
4176 else
4177 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004178#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 }
4180 requiredsize = respos+repsize+(endp-collend);
4181 if (requiredsize > ressize) {
4182 if (requiredsize<2*ressize)
4183 requiredsize = 2*ressize;
4184 if (_PyBytes_Resize(&res, requiredsize))
4185 goto onError;
4186 str = PyBytes_AS_STRING(res) + respos;
4187 ressize = requiredsize;
4188 }
4189 /* generate replacement (temporarily (mis)uses p) */
4190 for (p = collstart; p < collend; ++p) {
4191 str += sprintf(str, "&#%d;", (int)*p);
4192 }
4193 p = collend;
4194 break;
4195 default:
4196 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4197 encoding, reason, startp, size, &exc,
4198 collstart-startp, collend-startp, &newpos);
4199 if (repunicode == NULL)
4200 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004201 if (PyBytes_Check(repunicode)) {
4202 /* Directly copy bytes result to output. */
4203 repsize = PyBytes_Size(repunicode);
4204 if (repsize > 1) {
4205 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004206 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004207 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4208 Py_DECREF(repunicode);
4209 goto onError;
4210 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004211 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004212 ressize += repsize-1;
4213 }
4214 memcpy(str, PyBytes_AsString(repunicode), repsize);
4215 str += repsize;
4216 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004217 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004218 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 /* need more space? (at least enough for what we
4221 have+the replacement+the rest of the string, so
4222 we won't have to check space for encodable characters) */
4223 respos = str - PyBytes_AS_STRING(res);
4224 repsize = PyUnicode_GET_SIZE(repunicode);
4225 requiredsize = respos+repsize+(endp-collend);
4226 if (requiredsize > ressize) {
4227 if (requiredsize<2*ressize)
4228 requiredsize = 2*ressize;
4229 if (_PyBytes_Resize(&res, requiredsize)) {
4230 Py_DECREF(repunicode);
4231 goto onError;
4232 }
4233 str = PyBytes_AS_STRING(res) + respos;
4234 ressize = requiredsize;
4235 }
4236 /* check if there is anything unencodable in the replacement
4237 and copy it to the output */
4238 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4239 c = *uni2;
4240 if (c >= limit) {
4241 raise_encode_exception(&exc, encoding, startp, size,
4242 unicodepos, unicodepos+1, reason);
4243 Py_DECREF(repunicode);
4244 goto onError;
4245 }
4246 *str = (char)c;
4247 }
4248 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004249 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004251 }
4252 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004253 /* Resize if we allocated to much */
4254 size = str - PyBytes_AS_STRING(res);
4255 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004256 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004257 if (_PyBytes_Resize(&res, size) < 0)
4258 goto onError;
4259 }
4260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 Py_XDECREF(errorHandler);
4262 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004263 return res;
4264
4265 onError:
4266 Py_XDECREF(res);
4267 Py_XDECREF(errorHandler);
4268 Py_XDECREF(exc);
4269 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270}
4271
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 Py_ssize_t size,
4274 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277}
4278
4279PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4280{
4281 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 PyErr_BadArgument();
4283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 }
4285 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 PyUnicode_GET_SIZE(unicode),
4287 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288}
4289
4290/* --- 7-bit ASCII Codec -------------------------------------------------- */
4291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 Py_ssize_t size,
4294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 PyUnicodeObject *v;
4298 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004299 Py_ssize_t startinpos;
4300 Py_ssize_t endinpos;
4301 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 const char *e;
4303 PyObject *errorHandler = NULL;
4304 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004307 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 Py_UNICODE r = *(unsigned char*)s;
4309 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004310 }
Tim Petersced69f82003-09-16 20:30:58 +00004311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 v = _PyUnicode_New(size);
4313 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 e = s + size;
4319 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 register unsigned char c = (unsigned char)*s;
4321 if (c < 128) {
4322 *p++ = c;
4323 ++s;
4324 }
4325 else {
4326 startinpos = s-starts;
4327 endinpos = startinpos + 1;
4328 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4329 if (unicode_decode_call_errorhandler(
4330 errors, &errorHandler,
4331 "ascii", "ordinal not in range(128)",
4332 &starts, &e, &startinpos, &endinpos, &exc, &s,
4333 &v, &outpos, &p))
4334 goto onError;
4335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004337 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4339 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 Py_XDECREF(errorHandler);
4341 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 Py_XDECREF(errorHandler);
4347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 return NULL;
4349}
4350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 Py_ssize_t size,
4353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356}
4357
4358PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4359{
4360 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 PyErr_BadArgument();
4362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 }
4364 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 PyUnicode_GET_SIZE(unicode),
4366 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367}
4368
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004369#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004370
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004371/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004372
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004373#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004374#define NEED_RETRY
4375#endif
4376
4377/* XXX This code is limited to "true" double-byte encodings, as
4378 a) it assumes an incomplete character consists of a single byte, and
4379 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004381
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte: IsDBCSLeadByte() must accept it, and the character found by
   CharPrev() must either be the byte itself, not start with a lead byte,
   or be exactly two bytes before it.  Returns 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4392
4393/*
4394 * Decode MBCS string into unicode object. If 'final' is set, converts
4395 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4396 */
4397static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 const char *s, /* MBCS string */
4399 int size, /* sizeof MBCS string */
4400 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004401{
4402 Py_UNICODE *p;
4403 Py_ssize_t n = 0;
4404 int usize = 0;
4405
4406 assert(size >= 0);
4407
4408 /* Skip trailing lead-byte unless 'final' is set */
4409 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004411
4412 /* First get the size of the result */
4413 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4415 if (usize == 0) {
4416 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4417 return -1;
4418 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004419 }
4420
4421 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 /* Create unicode object */
4423 *v = _PyUnicode_New(usize);
4424 if (*v == NULL)
4425 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004426 }
4427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 /* Extend unicode object */
4429 n = PyUnicode_GET_SIZE(*v);
4430 if (_PyUnicode_Resize(v, n + usize) < 0)
4431 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004432 }
4433
4434 /* Do the conversion */
4435 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 p = PyUnicode_AS_UNICODE(*v) + n;
4437 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4438 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4439 return -1;
4440 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004441 }
4442
4443 return size;
4444}
4445
4446PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 Py_ssize_t size,
4448 const char *errors,
4449 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004450{
4451 PyUnicodeObject *v = NULL;
4452 int done;
4453
4454 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004456
4457#ifdef NEED_RETRY
4458 retry:
4459 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004461 else
4462#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004464
4465 if (done < 0) {
4466 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004468 }
4469
4470 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004472
4473#ifdef NEED_RETRY
4474 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 s += done;
4476 size -= done;
4477 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004478 }
4479#endif
4480
4481 return (PyObject *)v;
4482}
4483
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004484PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 Py_ssize_t size,
4486 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004487{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004488 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4489}
4490
4491/*
4492 * Convert unicode into string object (MBCS).
4493 * Returns 0 if succeed, -1 otherwise.
4494 */
4495static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 const Py_UNICODE *p, /* unicode */
4497 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004498{
4499 int mbcssize = 0;
4500 Py_ssize_t n = 0;
4501
4502 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004503
4504 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004505 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4507 if (mbcssize == 0) {
4508 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4509 return -1;
4510 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004511 }
4512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004513 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 /* Create string object */
4515 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4516 if (*repr == NULL)
4517 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004518 }
4519 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 /* Extend string object */
4521 n = PyBytes_Size(*repr);
4522 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4523 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004524 }
4525
4526 /* Do the conversion */
4527 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 char *s = PyBytes_AS_STRING(*repr) + n;
4529 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4530 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4531 return -1;
4532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004533 }
4534
4535 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004536}
4537
4538PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 Py_ssize_t size,
4540 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004541{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004542 PyObject *repr = NULL;
4543 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004544
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549 else
4550#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004552
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004553 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 Py_XDECREF(repr);
4555 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004556 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557
4558#ifdef NEED_RETRY
4559 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 p += INT_MAX;
4561 size -= INT_MAX;
4562 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004563 }
4564#endif
4565
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004566 return repr;
4567}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004568
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004569PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4570{
4571 if (!PyUnicode_Check(unicode)) {
4572 PyErr_BadArgument();
4573 return NULL;
4574 }
4575 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 PyUnicode_GET_SIZE(unicode),
4577 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004578}
4579
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580#undef NEED_RETRY
4581
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004582#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584/* --- Character Mapping Codec -------------------------------------------- */
4585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_ssize_t size,
4588 PyObject *mapping,
4589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t startinpos;
4593 Py_ssize_t endinpos;
4594 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 PyUnicodeObject *v;
4597 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004598 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 PyObject *errorHandler = NULL;
4600 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004601 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 /* Default to Latin-1 */
4605 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 v = _PyUnicode_New(size);
4609 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004615 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 mapstring = PyUnicode_AS_UNICODE(mapping);
4617 maplen = PyUnicode_GET_SIZE(mapping);
4618 while (s < e) {
4619 unsigned char ch = *s;
4620 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 if (ch < maplen)
4623 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 if (x == 0xfffe) {
4626 /* undefined mapping */
4627 outpos = p-PyUnicode_AS_UNICODE(v);
4628 startinpos = s-starts;
4629 endinpos = startinpos+1;
4630 if (unicode_decode_call_errorhandler(
4631 errors, &errorHandler,
4632 "charmap", "character maps to <undefined>",
4633 &starts, &e, &startinpos, &endinpos, &exc, &s,
4634 &v, &outpos, &p)) {
4635 goto onError;
4636 }
4637 continue;
4638 }
4639 *p++ = x;
4640 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004641 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004642 }
4643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 while (s < e) {
4645 unsigned char ch = *s;
4646 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004647
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4649 w = PyLong_FromLong((long)ch);
4650 if (w == NULL)
4651 goto onError;
4652 x = PyObject_GetItem(mapping, w);
4653 Py_DECREF(w);
4654 if (x == NULL) {
4655 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4656 /* No mapping found means: mapping is undefined. */
4657 PyErr_Clear();
4658 x = Py_None;
4659 Py_INCREF(x);
4660 } else
4661 goto onError;
4662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004663
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 /* Apply mapping */
4665 if (PyLong_Check(x)) {
4666 long value = PyLong_AS_LONG(x);
4667 if (value < 0 || value > 65535) {
4668 PyErr_SetString(PyExc_TypeError,
4669 "character mapping must be in range(65536)");
4670 Py_DECREF(x);
4671 goto onError;
4672 }
4673 *p++ = (Py_UNICODE)value;
4674 }
4675 else if (x == Py_None) {
4676 /* undefined mapping */
4677 outpos = p-PyUnicode_AS_UNICODE(v);
4678 startinpos = s-starts;
4679 endinpos = startinpos+1;
4680 if (unicode_decode_call_errorhandler(
4681 errors, &errorHandler,
4682 "charmap", "character maps to <undefined>",
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
4684 &v, &outpos, &p)) {
4685 Py_DECREF(x);
4686 goto onError;
4687 }
4688 Py_DECREF(x);
4689 continue;
4690 }
4691 else if (PyUnicode_Check(x)) {
4692 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004693
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 if (targetsize == 1)
4695 /* 1-1 mapping */
4696 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004697
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 else if (targetsize > 1) {
4699 /* 1-n mapping */
4700 if (targetsize > extrachars) {
4701 /* resize first */
4702 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4703 Py_ssize_t needed = (targetsize - extrachars) + \
4704 (targetsize << 2);
4705 extrachars += needed;
4706 /* XXX overflow detection missing */
4707 if (_PyUnicode_Resize(&v,
4708 PyUnicode_GET_SIZE(v) + needed) < 0) {
4709 Py_DECREF(x);
4710 goto onError;
4711 }
4712 p = PyUnicode_AS_UNICODE(v) + oldpos;
4713 }
4714 Py_UNICODE_COPY(p,
4715 PyUnicode_AS_UNICODE(x),
4716 targetsize);
4717 p += targetsize;
4718 extrachars -= targetsize;
4719 }
4720 /* 1-0 mapping: skip the character */
4721 }
4722 else {
4723 /* wrong return value */
4724 PyErr_SetString(PyExc_TypeError,
4725 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004726 Py_DECREF(x);
4727 goto onError;
4728 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 Py_DECREF(x);
4730 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 }
4733 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(errorHandler);
4737 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004739
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_XDECREF(v);
4744 return NULL;
4745}
4746
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004747/* Charmap encoding: the lookup table */
4748
4749struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 PyObject_HEAD
4751 unsigned char level1[32];
4752 int count2, count3;
4753 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004754};
4755
4756static PyObject*
4757encoding_map_size(PyObject *obj, PyObject* args)
4758{
4759 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004760 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004762}
4763
4764static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 PyDoc_STR("Return the size (in bytes) of this object") },
4767 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004768};
4769
4770static void
4771encoding_map_dealloc(PyObject* o)
4772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004773 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004774}
4775
4776static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 "EncodingMap", /*tp_name*/
4779 sizeof(struct encoding_map), /*tp_basicsize*/
4780 0, /*tp_itemsize*/
4781 /* methods */
4782 encoding_map_dealloc, /*tp_dealloc*/
4783 0, /*tp_print*/
4784 0, /*tp_getattr*/
4785 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004786 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 0, /*tp_repr*/
4788 0, /*tp_as_number*/
4789 0, /*tp_as_sequence*/
4790 0, /*tp_as_mapping*/
4791 0, /*tp_hash*/
4792 0, /*tp_call*/
4793 0, /*tp_str*/
4794 0, /*tp_getattro*/
4795 0, /*tp_setattro*/
4796 0, /*tp_as_buffer*/
4797 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4798 0, /*tp_doc*/
4799 0, /*tp_traverse*/
4800 0, /*tp_clear*/
4801 0, /*tp_richcompare*/
4802 0, /*tp_weaklistoffset*/
4803 0, /*tp_iter*/
4804 0, /*tp_iternext*/
4805 encoding_map_methods, /*tp_methods*/
4806 0, /*tp_members*/
4807 0, /*tp_getset*/
4808 0, /*tp_base*/
4809 0, /*tp_dict*/
4810 0, /*tp_descr_get*/
4811 0, /*tp_descr_set*/
4812 0, /*tp_dictoffset*/
4813 0, /*tp_init*/
4814 0, /*tp_alloc*/
4815 0, /*tp_new*/
4816 0, /*tp_free*/
4817 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004818};
4819
4820PyObject*
4821PyUnicode_BuildEncodingMap(PyObject* string)
4822{
4823 Py_UNICODE *decode;
4824 PyObject *result;
4825 struct encoding_map *mresult;
4826 int i;
4827 int need_dict = 0;
4828 unsigned char level1[32];
4829 unsigned char level2[512];
4830 unsigned char *mlevel1, *mlevel2, *mlevel3;
4831 int count2 = 0, count3 = 0;
4832
4833 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4834 PyErr_BadArgument();
4835 return NULL;
4836 }
4837 decode = PyUnicode_AS_UNICODE(string);
4838 memset(level1, 0xFF, sizeof level1);
4839 memset(level2, 0xFF, sizeof level2);
4840
4841 /* If there isn't a one-to-one mapping of NULL to \0,
4842 or if there are non-BMP characters, we need to use
4843 a mapping dictionary. */
4844 if (decode[0] != 0)
4845 need_dict = 1;
4846 for (i = 1; i < 256; i++) {
4847 int l1, l2;
4848 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004849#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004850 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004851#endif
4852 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004853 need_dict = 1;
4854 break;
4855 }
4856 if (decode[i] == 0xFFFE)
4857 /* unmapped character */
4858 continue;
4859 l1 = decode[i] >> 11;
4860 l2 = decode[i] >> 7;
4861 if (level1[l1] == 0xFF)
4862 level1[l1] = count2++;
4863 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004864 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004865 }
4866
4867 if (count2 >= 0xFF || count3 >= 0xFF)
4868 need_dict = 1;
4869
4870 if (need_dict) {
4871 PyObject *result = PyDict_New();
4872 PyObject *key, *value;
4873 if (!result)
4874 return NULL;
4875 for (i = 0; i < 256; i++) {
4876 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004877 key = PyLong_FromLong(decode[i]);
4878 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004879 if (!key || !value)
4880 goto failed1;
4881 if (PyDict_SetItem(result, key, value) == -1)
4882 goto failed1;
4883 Py_DECREF(key);
4884 Py_DECREF(value);
4885 }
4886 return result;
4887 failed1:
4888 Py_XDECREF(key);
4889 Py_XDECREF(value);
4890 Py_DECREF(result);
4891 return NULL;
4892 }
4893
4894 /* Create a three-level trie */
4895 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4896 16*count2 + 128*count3 - 1);
4897 if (!result)
4898 return PyErr_NoMemory();
4899 PyObject_Init(result, &EncodingMapType);
4900 mresult = (struct encoding_map*)result;
4901 mresult->count2 = count2;
4902 mresult->count3 = count3;
4903 mlevel1 = mresult->level1;
4904 mlevel2 = mresult->level23;
4905 mlevel3 = mresult->level23 + 16*count2;
4906 memcpy(mlevel1, level1, 32);
4907 memset(mlevel2, 0xFF, 16*count2);
4908 memset(mlevel3, 0, 128*count3);
4909 count3 = 0;
4910 for (i = 1; i < 256; i++) {
4911 int o1, o2, o3, i2, i3;
4912 if (decode[i] == 0xFFFE)
4913 /* unmapped character */
4914 continue;
4915 o1 = decode[i]>>11;
4916 o2 = (decode[i]>>7) & 0xF;
4917 i2 = 16*mlevel1[o1] + o2;
4918 if (mlevel2[i2] == 0xFF)
4919 mlevel2[i2] = count3++;
4920 o3 = decode[i] & 0x7F;
4921 i3 = 128*mlevel2[i2] + o3;
4922 mlevel3[i3] = i;
4923 }
4924 return result;
4925}
4926
4927static int
4928encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4929{
4930 struct encoding_map *map = (struct encoding_map*)mapping;
4931 int l1 = c>>11;
4932 int l2 = (c>>7) & 0xF;
4933 int l3 = c & 0x7F;
4934 int i;
4935
4936#ifdef Py_UNICODE_WIDE
4937 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004939 }
4940#endif
4941 if (c == 0)
4942 return 0;
4943 /* level 1*/
4944 i = map->level1[l1];
4945 if (i == 0xFF) {
4946 return -1;
4947 }
4948 /* level 2*/
4949 i = map->level23[16*i+l2];
4950 if (i == 0xFF) {
4951 return -1;
4952 }
4953 /* level 3 */
4954 i = map->level23[16*map->count2 + 128*i + l3];
4955 if (i == 0) {
4956 return -1;
4957 }
4958 return i;
4959}
4960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961/* Lookup the character ch in the mapping. If the character
4962 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004963 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965{
Christian Heimes217cfd12007-12-02 14:31:20 +00004966 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 PyObject *x;
4968
4969 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 x = PyObject_GetItem(mapping, w);
4972 Py_DECREF(w);
4973 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4975 /* No mapping found means: mapping is undefined. */
4976 PyErr_Clear();
4977 x = Py_None;
4978 Py_INCREF(x);
4979 return x;
4980 } else
4981 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004983 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004985 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 long value = PyLong_AS_LONG(x);
4987 if (value < 0 || value > 255) {
4988 PyErr_SetString(PyExc_TypeError,
4989 "character mapping must be in range(256)");
4990 Py_DECREF(x);
4991 return NULL;
4992 }
4993 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004995 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* wrong return value */
4999 PyErr_Format(PyExc_TypeError,
5000 "character mapping must return integer, bytes or None, not %.400s",
5001 x->ob_type->tp_name);
5002 Py_DECREF(x);
5003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 }
5005}
5006
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005008charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005009{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005010 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5011 /* exponentially overallocate to minimize reallocations */
5012 if (requiredsize < 2*outsize)
5013 requiredsize = 2*outsize;
5014 if (_PyBytes_Resize(outobj, requiredsize))
5015 return -1;
5016 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005017}
5018
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005023 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 space is available. Return a new reference to the object that
5025 was put in the output buffer, or Py_None, if the mapping was undefined
5026 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005027 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005029charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005032 PyObject *rep;
5033 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005034 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035
Christian Heimes90aa7642007-12-19 02:45:37 +00005036 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005037 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005039 if (res == -1)
5040 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 if (outsize<requiredsize)
5042 if (charmapencode_resize(outobj, outpos, requiredsize))
5043 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005044 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 outstart[(*outpos)++] = (char)res;
5046 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005047 }
5048
5049 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005052 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 Py_DECREF(rep);
5054 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005055 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 if (PyLong_Check(rep)) {
5057 Py_ssize_t requiredsize = *outpos+1;
5058 if (outsize<requiredsize)
5059 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5060 Py_DECREF(rep);
5061 return enc_EXCEPTION;
5062 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005063 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 else {
5067 const char *repchars = PyBytes_AS_STRING(rep);
5068 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5069 Py_ssize_t requiredsize = *outpos+repsize;
5070 if (outsize<requiredsize)
5071 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5072 Py_DECREF(rep);
5073 return enc_EXCEPTION;
5074 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005075 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 memcpy(outstart + *outpos, repchars, repsize);
5077 *outpos += repsize;
5078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005080 Py_DECREF(rep);
5081 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082}
5083
5084/* handle an error in PyUnicode_EncodeCharmap
5085 Return 0 on success, -1 on error */
5086static
5087int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005088 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005090 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005091 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092{
5093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t repsize;
5095 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 Py_UNICODE *uni2;
5097 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005098 Py_ssize_t collstartpos = *inpos;
5099 Py_ssize_t collendpos = *inpos+1;
5100 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101 char *encoding = "charmap";
5102 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005103 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 /* find all unencodable characters */
5106 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005107 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005108 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 int res = encoding_map_lookup(p[collendpos], mapping);
5110 if (res != -1)
5111 break;
5112 ++collendpos;
5113 continue;
5114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005115
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 rep = charmapencode_lookup(p[collendpos], mapping);
5117 if (rep==NULL)
5118 return -1;
5119 else if (rep!=Py_None) {
5120 Py_DECREF(rep);
5121 break;
5122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005123 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 }
5126 /* cache callback name lookup
5127 * (if not done yet, i.e. it's the first error) */
5128 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if ((errors==NULL) || (!strcmp(errors, "strict")))
5130 *known_errorHandler = 1;
5131 else if (!strcmp(errors, "replace"))
5132 *known_errorHandler = 2;
5133 else if (!strcmp(errors, "ignore"))
5134 *known_errorHandler = 3;
5135 else if (!strcmp(errors, "xmlcharrefreplace"))
5136 *known_errorHandler = 4;
5137 else
5138 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 }
5140 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005141 case 1: /* strict */
5142 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5143 return -1;
5144 case 2: /* replace */
5145 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 x = charmapencode_output('?', mapping, res, respos);
5147 if (x==enc_EXCEPTION) {
5148 return -1;
5149 }
5150 else if (x==enc_FAILED) {
5151 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5152 return -1;
5153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005154 }
5155 /* fall through */
5156 case 3: /* ignore */
5157 *inpos = collendpos;
5158 break;
5159 case 4: /* xmlcharrefreplace */
5160 /* generate replacement (temporarily (mis)uses p) */
5161 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 char buffer[2+29+1+1];
5163 char *cp;
5164 sprintf(buffer, "&#%d;", (int)p[collpos]);
5165 for (cp = buffer; *cp; ++cp) {
5166 x = charmapencode_output(*cp, mapping, res, respos);
5167 if (x==enc_EXCEPTION)
5168 return -1;
5169 else if (x==enc_FAILED) {
5170 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5171 return -1;
5172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005173 }
5174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005175 *inpos = collendpos;
5176 break;
5177 default:
5178 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 encoding, reason, p, size, exceptionObject,
5180 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005181 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005183 if (PyBytes_Check(repunicode)) {
5184 /* Directly copy bytes result to output. */
5185 Py_ssize_t outsize = PyBytes_Size(*res);
5186 Py_ssize_t requiredsize;
5187 repsize = PyBytes_Size(repunicode);
5188 requiredsize = *respos + repsize;
5189 if (requiredsize > outsize)
5190 /* Make room for all additional bytes. */
5191 if (charmapencode_resize(res, respos, requiredsize)) {
5192 Py_DECREF(repunicode);
5193 return -1;
5194 }
5195 memcpy(PyBytes_AsString(*res) + *respos,
5196 PyBytes_AsString(repunicode), repsize);
5197 *respos += repsize;
5198 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005199 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005200 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005202 /* generate replacement */
5203 repsize = PyUnicode_GET_SIZE(repunicode);
5204 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 x = charmapencode_output(*uni2, mapping, res, respos);
5206 if (x==enc_EXCEPTION) {
5207 return -1;
5208 }
5209 else if (x==enc_FAILED) {
5210 Py_DECREF(repunicode);
5211 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5212 return -1;
5213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005214 }
5215 *inpos = newpos;
5216 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 }
5218 return 0;
5219}
5220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 PyObject *mapping,
5224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 /* output object */
5227 PyObject *res = NULL;
5228 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232 PyObject *errorHandler = NULL;
5233 PyObject *exc = NULL;
5234 /* the following variable is used for caching string comparisons
5235 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5236 * 3=ignore, 4=xmlcharrefreplace */
5237 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
5239 /* Default to Latin-1 */
5240 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 /* allocate enough for a simple encoding without
5244 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005245 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 if (res == NULL)
5247 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005248 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 /* try to encode it */
5253 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5254 if (x==enc_EXCEPTION) /* error */
5255 goto onError;
5256 if (x==enc_FAILED) { /* unencodable character */
5257 if (charmap_encoding_error(p, size, &inpos, mapping,
5258 &exc,
5259 &known_errorHandler, &errorHandler, errors,
5260 &res, &respos)) {
5261 goto onError;
5262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005263 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 else
5265 /* done with this character => adjust input position */
5266 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005270 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005271 if (_PyBytes_Resize(&res, respos) < 0)
5272 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274 Py_XDECREF(exc);
5275 Py_XDECREF(errorHandler);
5276 return res;
5277
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 Py_XDECREF(res);
5280 Py_XDECREF(exc);
5281 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 return NULL;
5283}
5284
5285PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
5288 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 PyErr_BadArgument();
5290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 }
5292 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 PyUnicode_GET_SIZE(unicode),
5294 mapping,
5295 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296}
5297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298/* create or adjust a UnicodeTranslateError */
5299static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 const Py_UNICODE *unicode, Py_ssize_t size,
5301 Py_ssize_t startpos, Py_ssize_t endpos,
5302 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
5308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5310 goto onError;
5311 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5312 goto onError;
5313 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5314 goto onError;
5315 return;
5316 onError:
5317 Py_DECREF(*exceptionObject);
5318 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
5320}
5321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322/* raises a UnicodeTranslateError */
5323static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 const Py_UNICODE *unicode, Py_ssize_t size,
5325 Py_ssize_t startpos, Py_ssize_t endpos,
5326 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327{
5328 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332}
5333
5334/* error handling callback helper:
5335 build arguments, call the callback and check the arguments,
5336 put the result into newpos and return the replacement string, which
5337 has to be freed by the caller */
5338static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 PyObject **errorHandler,
5340 const char *reason,
5341 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5342 Py_ssize_t startpos, Py_ssize_t endpos,
5343 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005345 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005347 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 PyObject *restuple;
5349 PyObject *resunicode;
5350
5351 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 }
5356
5357 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361
5362 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005367 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 Py_DECREF(restuple);
5369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 }
5371 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 &resunicode, &i_newpos)) {
5373 Py_DECREF(restuple);
5374 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005378 else
5379 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005380 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5382 Py_DECREF(restuple);
5383 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 Py_INCREF(resunicode);
5386 Py_DECREF(restuple);
5387 return resunicode;
5388}
5389
5390/* Lookup the character ch in the mapping and put the result in result,
5391 which must be decrefed by the caller.
5392 Return 0 on success, -1 on error */
5393static
5394int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5395{
Christian Heimes217cfd12007-12-02 14:31:20 +00005396 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 PyObject *x;
5398
5399 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 x = PyObject_GetItem(mapping, w);
5402 Py_DECREF(w);
5403 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5405 /* No mapping found means: use 1:1 mapping. */
5406 PyErr_Clear();
5407 *result = NULL;
5408 return 0;
5409 } else
5410 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 }
5412 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *result = x;
5414 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005416 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 long value = PyLong_AS_LONG(x);
5418 long max = PyUnicode_GetMax();
5419 if (value < 0 || value > max) {
5420 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005421 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 Py_DECREF(x);
5423 return -1;
5424 }
5425 *result = x;
5426 return 0;
5427 }
5428 else if (PyUnicode_Check(x)) {
5429 *result = x;
5430 return 0;
5431 }
5432 else {
5433 /* wrong return value */
5434 PyErr_SetString(PyExc_TypeError,
5435 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005436 Py_DECREF(x);
5437 return -1;
5438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439}
5440/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 if not reallocate and adjust various state variables.
5442 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443static
Walter Dörwald4894c302003-10-24 14:25:28 +00005444int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005447 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005448 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 /* remember old output position */
5450 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5451 /* exponentially overallocate to minimize reallocations */
5452 if (requiredsize < 2 * oldsize)
5453 requiredsize = 2 * oldsize;
5454 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5455 return -1;
5456 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 }
5458 return 0;
5459}
5460/* lookup the character, put the result in the output string and adjust
5461 various state variables. Return a new reference to the object that
5462 was put in the output buffer in *result, or Py_None, if the mapping was
5463 undefined (in which case no character was written).
5464 The called must decref result.
5465 Return 0 on success, -1 on error. */
5466static
Walter Dörwald4894c302003-10-24 14:25:28 +00005467int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5469 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470{
Walter Dörwald4894c302003-10-24 14:25:28 +00005471 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 /* not found => default to 1:1 mapping */
5475 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 }
5477 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005479 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 /* no overflow check, because we know that the space is enough */
5481 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 }
5483 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5485 if (repsize==1) {
5486 /* no overflow check, because we know that the space is enough */
5487 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5488 }
5489 else if (repsize!=0) {
5490 /* more than one character */
5491 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5492 (insize - (curinp-startinp)) +
5493 repsize - 1;
5494 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5495 return -1;
5496 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5497 *outp += repsize;
5498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 }
5500 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 return 0;
5503}
5504
5505PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 Py_ssize_t size,
5507 PyObject *mapping,
5508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 /* output object */
5511 PyObject *res = NULL;
5512 /* pointers to the beginning and end+1 of input */
5513 const Py_UNICODE *startp = p;
5514 const Py_UNICODE *endp = p + size;
5515 /* pointer into the output */
5516 Py_UNICODE *str;
5517 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 char *reason = "character maps to <undefined>";
5520 PyObject *errorHandler = NULL;
5521 PyObject *exc = NULL;
5522 /* the following variable is used for caching string comparisons
5523 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5524 * 3=ignore, 4=xmlcharrefreplace */
5525 int known_errorHandler = -1;
5526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 PyErr_BadArgument();
5529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531
5532 /* allocate enough for a simple 1:1 translation without
5533 replacements, if we need more, we'll resize */
5534 res = PyUnicode_FromUnicode(NULL, size);
5535 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 /* try to encode it */
5543 PyObject *x = NULL;
5544 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5545 Py_XDECREF(x);
5546 goto onError;
5547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005548 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 if (x!=Py_None) /* it worked => adjust input pointer */
5550 ++p;
5551 else { /* untranslatable character */
5552 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5553 Py_ssize_t repsize;
5554 Py_ssize_t newpos;
5555 Py_UNICODE *uni2;
5556 /* startpos for collecting untranslatable chars */
5557 const Py_UNICODE *collstart = p;
5558 const Py_UNICODE *collend = p+1;
5559 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 /* find all untranslatable characters */
5562 while (collend < endp) {
5563 if (charmaptranslate_lookup(*collend, mapping, &x))
5564 goto onError;
5565 Py_XDECREF(x);
5566 if (x!=Py_None)
5567 break;
5568 ++collend;
5569 }
5570 /* cache callback name lookup
5571 * (if not done yet, i.e. it's the first error) */
5572 if (known_errorHandler==-1) {
5573 if ((errors==NULL) || (!strcmp(errors, "strict")))
5574 known_errorHandler = 1;
5575 else if (!strcmp(errors, "replace"))
5576 known_errorHandler = 2;
5577 else if (!strcmp(errors, "ignore"))
5578 known_errorHandler = 3;
5579 else if (!strcmp(errors, "xmlcharrefreplace"))
5580 known_errorHandler = 4;
5581 else
5582 known_errorHandler = 0;
5583 }
5584 switch (known_errorHandler) {
5585 case 1: /* strict */
5586 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005587 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 case 2: /* replace */
5589 /* No need to check for space, this is a 1:1 replacement */
5590 for (coll = collstart; coll<collend; ++coll)
5591 *str++ = '?';
5592 /* fall through */
5593 case 3: /* ignore */
5594 p = collend;
5595 break;
5596 case 4: /* xmlcharrefreplace */
5597 /* generate replacement (temporarily (mis)uses p) */
5598 for (p = collstart; p < collend; ++p) {
5599 char buffer[2+29+1+1];
5600 char *cp;
5601 sprintf(buffer, "&#%d;", (int)*p);
5602 if (charmaptranslate_makespace(&res, &str,
5603 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5604 goto onError;
5605 for (cp = buffer; *cp; ++cp)
5606 *str++ = *cp;
5607 }
5608 p = collend;
5609 break;
5610 default:
5611 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5612 reason, startp, size, &exc,
5613 collstart-startp, collend-startp, &newpos);
5614 if (repunicode == NULL)
5615 goto onError;
5616 /* generate replacement */
5617 repsize = PyUnicode_GET_SIZE(repunicode);
5618 if (charmaptranslate_makespace(&res, &str,
5619 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5620 Py_DECREF(repunicode);
5621 goto onError;
5622 }
5623 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5624 *str++ = *uni2;
5625 p = startp + newpos;
5626 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005628 }
5629 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 /* Resize if we allocated to much */
5631 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005632 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 if (PyUnicode_Resize(&res, respos) < 0)
5634 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 }
5636 Py_XDECREF(exc);
5637 Py_XDECREF(errorHandler);
5638 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 Py_XDECREF(res);
5642 Py_XDECREF(exc);
5643 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return NULL;
5645}
5646
5647PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 PyObject *mapping,
5649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650{
5651 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 str = PyUnicode_FromObject(str);
5654 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 PyUnicode_GET_SIZE(str),
5658 mapping,
5659 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 Py_DECREF(str);
5661 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005662
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 Py_XDECREF(str);
5665 return NULL;
5666}
Tim Petersced69f82003-09-16 20:30:58 +00005667
Guido van Rossum9e896b32000-04-05 20:11:21 +00005668/* --- Decimal Encoder ---------------------------------------------------- */
5669
5670int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 Py_ssize_t length,
5672 char *output,
5673 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005674{
5675 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 PyObject *errorHandler = NULL;
5677 PyObject *exc = NULL;
5678 const char *encoding = "decimal";
5679 const char *reason = "invalid decimal Unicode string";
5680 /* the following variable is used for caching string comparisons
5681 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5682 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005683
5684 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 PyErr_BadArgument();
5686 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005687 }
5688
5689 p = s;
5690 end = s + length;
5691 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 register Py_UNICODE ch = *p;
5693 int decimal;
5694 PyObject *repunicode;
5695 Py_ssize_t repsize;
5696 Py_ssize_t newpos;
5697 Py_UNICODE *uni2;
5698 Py_UNICODE *collstart;
5699 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005702 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 ++p;
5704 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 decimal = Py_UNICODE_TODECIMAL(ch);
5707 if (decimal >= 0) {
5708 *output++ = '0' + decimal;
5709 ++p;
5710 continue;
5711 }
5712 if (0 < ch && ch < 256) {
5713 *output++ = (char)ch;
5714 ++p;
5715 continue;
5716 }
5717 /* All other characters are considered unencodable */
5718 collstart = p;
5719 collend = p+1;
5720 while (collend < end) {
5721 if ((0 < *collend && *collend < 256) ||
5722 !Py_UNICODE_ISSPACE(*collend) ||
5723 Py_UNICODE_TODECIMAL(*collend))
5724 break;
5725 }
5726 /* cache callback name lookup
5727 * (if not done yet, i.e. it's the first error) */
5728 if (known_errorHandler==-1) {
5729 if ((errors==NULL) || (!strcmp(errors, "strict")))
5730 known_errorHandler = 1;
5731 else if (!strcmp(errors, "replace"))
5732 known_errorHandler = 2;
5733 else if (!strcmp(errors, "ignore"))
5734 known_errorHandler = 3;
5735 else if (!strcmp(errors, "xmlcharrefreplace"))
5736 known_errorHandler = 4;
5737 else
5738 known_errorHandler = 0;
5739 }
5740 switch (known_errorHandler) {
5741 case 1: /* strict */
5742 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5743 goto onError;
5744 case 2: /* replace */
5745 for (p = collstart; p < collend; ++p)
5746 *output++ = '?';
5747 /* fall through */
5748 case 3: /* ignore */
5749 p = collend;
5750 break;
5751 case 4: /* xmlcharrefreplace */
5752 /* generate replacement (temporarily (mis)uses p) */
5753 for (p = collstart; p < collend; ++p)
5754 output += sprintf(output, "&#%d;", (int)*p);
5755 p = collend;
5756 break;
5757 default:
5758 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5759 encoding, reason, s, length, &exc,
5760 collstart-s, collend-s, &newpos);
5761 if (repunicode == NULL)
5762 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005763 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005764 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005765 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5766 Py_DECREF(repunicode);
5767 goto onError;
5768 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 /* generate replacement */
5770 repsize = PyUnicode_GET_SIZE(repunicode);
5771 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5772 Py_UNICODE ch = *uni2;
5773 if (Py_UNICODE_ISSPACE(ch))
5774 *output++ = ' ';
5775 else {
5776 decimal = Py_UNICODE_TODECIMAL(ch);
5777 if (decimal >= 0)
5778 *output++ = '0' + decimal;
5779 else if (0 < ch && ch < 256)
5780 *output++ = (char)ch;
5781 else {
5782 Py_DECREF(repunicode);
5783 raise_encode_exception(&exc, encoding,
5784 s, length, collstart-s, collend-s, reason);
5785 goto onError;
5786 }
5787 }
5788 }
5789 p = s + newpos;
5790 Py_DECREF(repunicode);
5791 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005792 }
5793 /* 0-terminate the output string */
5794 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_XDECREF(exc);
5796 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005797 return 0;
5798
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 Py_XDECREF(exc);
5801 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005802 return -1;
5803}
5804
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805/* --- Helpers ------------------------------------------------------------ */
5806
Eric Smith8c663262007-08-25 02:26:07 +00005807#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005808#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005809#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005810/* Include _ParseTupleFinds from find.h */
5811#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005812#include "stringlib/find.h"
5813#include "stringlib/partition.h"
5814
Eric Smith5807c412008-05-11 21:00:57 +00005815#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005816#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005817#include "stringlib/localeutil.h"
5818
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819/* helper macro to fixup start/end slice values */
5820#define FIX_START_END(obj) \
5821 if (start < 0) \
5822 start += (obj)->length; \
5823 if (start < 0) \
5824 start = 0; \
5825 if (end > (obj)->length) \
5826 end = (obj)->length; \
5827 if (end < 0) \
5828 end += (obj)->length; \
5829 if (end < 0) \
5830 end = 0;
5831
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 PyObject *substr,
5834 Py_ssize_t start,
5835 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005838 PyUnicodeObject* str_obj;
5839 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005840
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5842 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5845 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 Py_DECREF(str_obj);
5847 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
Tim Petersced69f82003-09-16 20:30:58 +00005849
Thomas Wouters477c8d52006-05-27 19:21:47 +00005850 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005851
Thomas Wouters477c8d52006-05-27 19:21:47 +00005852 result = stringlib_count(
5853 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5854 );
5855
5856 Py_DECREF(sub_obj);
5857 Py_DECREF(str_obj);
5858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return result;
5860}
5861
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 PyObject *sub,
5864 Py_ssize_t start,
5865 Py_ssize_t end,
5866 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005869
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005873 sub = PyUnicode_FromObject(sub);
5874 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 Py_DECREF(str);
5876 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 }
Tim Petersced69f82003-09-16 20:30:58 +00005878
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879 if (direction > 0)
5880 result = stringlib_find_slice(
5881 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5882 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5883 start, end
5884 );
5885 else
5886 result = stringlib_rfind_slice(
5887 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5888 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5889 start, end
5890 );
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893 Py_DECREF(sub);
5894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return result;
5896}
5897
Tim Petersced69f82003-09-16 20:30:58 +00005898static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 PyUnicodeObject *substring,
5901 Py_ssize_t start,
5902 Py_ssize_t end,
5903 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 if (substring->length == 0)
5906 return 1;
5907
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910 end -= substring->length;
5911 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
5914 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 if (Py_UNICODE_MATCH(self, end, substring))
5916 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 } else {
5918 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 }
5921
5922 return 0;
5923}
5924
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 PyObject *substr,
5927 Py_ssize_t start,
5928 Py_ssize_t end,
5929 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005931 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 str = PyUnicode_FromObject(str);
5934 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 substr = PyUnicode_FromObject(substr);
5937 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 Py_DECREF(str);
5939 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Tim Petersced69f82003-09-16 20:30:58 +00005941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 (PyUnicodeObject *)substr,
5944 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 Py_DECREF(str);
5946 Py_DECREF(substr);
5947 return result;
5948}
5949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950/* Apply fixfct filter to the Unicode object self and return a
5951 reference to the modified object */
5952
Tim Petersced69f82003-09-16 20:30:58 +00005953static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
5957
5958 PyUnicodeObject *u;
5959
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005960 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005963
5964 Py_UNICODE_COPY(u->str, self->str, self->length);
5965
Tim Peters7a29bd52001-09-12 03:03:31 +00005966 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 /* fixfct should return TRUE if it modified the buffer. If
5968 FALSE, return a reference to the original buffer instead
5969 (to save space, not time) */
5970 Py_INCREF(self);
5971 Py_DECREF(u);
5972 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
5974 return (PyObject*) u;
5975}
5976
Tim Petersced69f82003-09-16 20:30:58 +00005977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978int fixupper(PyUnicodeObject *self)
5979{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005980 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 Py_UNICODE *s = self->str;
5982 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005986
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 ch = Py_UNICODE_TOUPPER(*s);
5988 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 *s = ch;
5991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 s++;
5993 }
5994
5995 return status;
5996}
5997
Tim Petersced69f82003-09-16 20:30:58 +00005998static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999int fixlower(PyUnicodeObject *self)
6000{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 Py_UNICODE *s = self->str;
6003 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 ch = Py_UNICODE_TOLOWER(*s);
6009 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 *s = ch;
6012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 s++;
6014 }
6015
6016 return status;
6017}
6018
Tim Petersced69f82003-09-16 20:30:58 +00006019static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020int fixswapcase(PyUnicodeObject *self)
6021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 Py_UNICODE *s = self->str;
6024 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 while (len-- > 0) {
6027 if (Py_UNICODE_ISUPPER(*s)) {
6028 *s = Py_UNICODE_TOLOWER(*s);
6029 status = 1;
6030 } else if (Py_UNICODE_ISLOWER(*s)) {
6031 *s = Py_UNICODE_TOUPPER(*s);
6032 status = 1;
6033 }
6034 s++;
6035 }
6036
6037 return status;
6038}
6039
Tim Petersced69f82003-09-16 20:30:58 +00006040static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041int fixcapitalize(PyUnicodeObject *self)
6042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006044 Py_UNICODE *s = self->str;
6045 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006046
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006047 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006049 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 *s = Py_UNICODE_TOUPPER(*s);
6051 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006053 s++;
6054 while (--len > 0) {
6055 if (Py_UNICODE_ISUPPER(*s)) {
6056 *s = Py_UNICODE_TOLOWER(*s);
6057 status = 1;
6058 }
6059 s++;
6060 }
6061 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062}
6063
6064static
6065int fixtitle(PyUnicodeObject *self)
6066{
6067 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6068 register Py_UNICODE *e;
6069 int previous_is_cased;
6070
6071 /* Shortcut for single character strings */
6072 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6074 if (*p != ch) {
6075 *p = ch;
6076 return 1;
6077 }
6078 else
6079 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Tim Petersced69f82003-09-16 20:30:58 +00006081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 e = p + PyUnicode_GET_SIZE(self);
6083 previous_is_cased = 0;
6084 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 if (previous_is_cased)
6088 *p = Py_UNICODE_TOLOWER(ch);
6089 else
6090 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006091
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 if (Py_UNICODE_ISLOWER(ch) ||
6093 Py_UNICODE_ISUPPER(ch) ||
6094 Py_UNICODE_ISTITLE(ch))
6095 previous_is_cased = 1;
6096 else
6097 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
6099 return 1;
6100}
6101
Tim Peters8ce9f162004-08-27 01:49:32 +00006102PyObject *
6103PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104{
Skip Montanaro6543b452004-09-16 03:28:13 +00006105 const Py_UNICODE blank = ' ';
6106 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006107 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006108 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006109 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6110 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006111 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6112 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006113 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006114 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Tim Peters05eba1f2004-08-27 21:32:02 +00006116 fseq = PySequence_Fast(seq, "");
6117 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006118 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006119 }
6120
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006121 /* NOTE: the following code can't call back into Python code,
6122 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006123 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006124
Tim Peters05eba1f2004-08-27 21:32:02 +00006125 seqlen = PySequence_Fast_GET_SIZE(fseq);
6126 /* If empty sequence, return u"". */
6127 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006128 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6129 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006130 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006131 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006132 /* If singleton sequence with an exact Unicode, return that. */
6133 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 item = items[0];
6135 if (PyUnicode_CheckExact(item)) {
6136 Py_INCREF(item);
6137 res = (PyUnicodeObject *)item;
6138 goto Done;
6139 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006140 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006141 else {
6142 /* Set up sep and seplen */
6143 if (separator == NULL) {
6144 sep = &blank;
6145 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006146 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006147 else {
6148 if (!PyUnicode_Check(separator)) {
6149 PyErr_Format(PyExc_TypeError,
6150 "separator: expected str instance,"
6151 " %.80s found",
6152 Py_TYPE(separator)->tp_name);
6153 goto onError;
6154 }
6155 sep = PyUnicode_AS_UNICODE(separator);
6156 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006157 }
6158 }
6159
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006160 /* There are at least two things to join, or else we have a subclass
6161 * of str in the sequence.
6162 * Do a pre-pass to figure out the total amount of space we'll
6163 * need (sz), and see whether all argument are strings.
6164 */
6165 sz = 0;
6166 for (i = 0; i < seqlen; i++) {
6167 const Py_ssize_t old_sz = sz;
6168 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 if (!PyUnicode_Check(item)) {
6170 PyErr_Format(PyExc_TypeError,
6171 "sequence item %zd: expected str instance,"
6172 " %.80s found",
6173 i, Py_TYPE(item)->tp_name);
6174 goto onError;
6175 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006176 sz += PyUnicode_GET_SIZE(item);
6177 if (i != 0)
6178 sz += seplen;
6179 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6180 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006182 goto onError;
6183 }
6184 }
Tim Petersced69f82003-09-16 20:30:58 +00006185
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006186 res = _PyUnicode_New(sz);
6187 if (res == NULL)
6188 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006189
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006190 /* Catenate everything. */
6191 res_p = PyUnicode_AS_UNICODE(res);
6192 for (i = 0; i < seqlen; ++i) {
6193 Py_ssize_t itemlen;
6194 item = items[i];
6195 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 /* Copy item, and maybe the separator. */
6197 if (i) {
6198 Py_UNICODE_COPY(res_p, sep, seplen);
6199 res_p += seplen;
6200 }
6201 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6202 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006203 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006206 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 return (PyObject *)res;
6208
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006210 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006211 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return NULL;
6213}
6214
Tim Petersced69f82003-09-16 20:30:58 +00006215static
6216PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 Py_ssize_t left,
6218 Py_ssize_t right,
6219 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220{
6221 PyUnicodeObject *u;
6222
6223 if (left < 0)
6224 left = 0;
6225 if (right < 0)
6226 right = 0;
6227
Tim Peters7a29bd52001-09-12 03:03:31 +00006228 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 Py_INCREF(self);
6230 return self;
6231 }
6232
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006233 if (left > PY_SSIZE_T_MAX - self->length ||
6234 right > PY_SSIZE_T_MAX - (left + self->length)) {
6235 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6236 return NULL;
6237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 u = _PyUnicode_New(left + self->length + right);
6239 if (u) {
6240 if (left)
6241 Py_UNICODE_FILL(u->str, fill, left);
6242 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6243 if (right)
6244 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6245 }
6246
6247 return u;
6248}
6249
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expanding function must provide a local `PyObject *str`, a
   variable named `list`, and an `onError` label, which is jumped to when
   either the allocation or the append fails.  The body is wrapped in
   do { ... } while (0) so that the macro expands to a single statement
   and can be used safely under an unbraced if/else; invoke it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static
6262PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 PyObject *list,
6264 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006266 register Py_ssize_t i;
6267 register Py_ssize_t j;
6268 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006270 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006276 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6278 i++;
6279 if (j < i) {
6280 if (maxcount-- <= 0)
6281 break;
6282 SPLIT_APPEND(buf, j, i);
6283 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6284 i++;
6285 j = i;
6286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 }
6291 return list;
6292
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_DECREF(list);
6295 return NULL;
6296}
6297
6298PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006301 register Py_ssize_t i;
6302 register Py_ssize_t j;
6303 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 PyObject *list;
6305 PyObject *str;
6306 Py_UNICODE *data;
6307
6308 string = PyUnicode_FromObject(string);
6309 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 data = PyUnicode_AS_UNICODE(string);
6312 len = PyUnicode_GET_SIZE(string);
6313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 list = PyList_New(0);
6315 if (!list)
6316 goto onError;
6317
6318 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 /* Find a line and append it */
6322 while (i < len && !BLOOM_LINEBREAK(data[i]))
6323 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 if (i < len) {
6328 if (data[i] == '\r' && i + 1 < len &&
6329 data[i+1] == '\n')
6330 i += 2;
6331 else
6332 i++;
6333 if (keepends)
6334 eol = i;
6335 }
6336 SPLIT_APPEND(data, j, eol);
6337 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 }
6339 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 }
6342
6343 Py_DECREF(string);
6344 return list;
6345
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006347 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 Py_DECREF(string);
6349 return NULL;
6350}
6351
Tim Petersced69f82003-09-16 20:30:58 +00006352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 PyObject *list,
6355 Py_UNICODE ch,
6356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 register Py_ssize_t i;
6359 register Py_ssize_t j;
6360 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006362 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 if (buf[i] == ch) {
6366 if (maxcount-- <= 0)
6367 break;
6368 SPLIT_APPEND(buf, j, i);
6369 i = j = i + 1;
6370 } else
6371 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
6373 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
6376 return list;
6377
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 Py_DECREF(list);
6380 return NULL;
6381}
6382
Tim Petersced69f82003-09-16 20:30:58 +00006383static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 PyObject *list,
6386 PyUnicodeObject *substring,
6387 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 register Py_ssize_t i;
6390 register Py_ssize_t j;
6391 Py_ssize_t len = self->length;
6392 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 PyObject *str;
6394
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006395 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (Py_UNICODE_MATCH(self, i, substring)) {
6397 if (maxcount-- <= 0)
6398 break;
6399 SPLIT_APPEND(self->str, j, i);
6400 i = j = i + sublen;
6401 } else
6402 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 }
6404 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407 return list;
6408
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 Py_DECREF(list);
6411 return NULL;
6412}
6413
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006414static
6415PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 PyObject *list,
6417 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006418{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006419 register Py_ssize_t i;
6420 register Py_ssize_t j;
6421 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006422 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006423 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006424
6425 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006427 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006429 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6431 i--;
6432 if (j > i) {
6433 if (maxcount-- <= 0)
6434 break;
6435 SPLIT_APPEND(buf, i + 1, j + 1);
6436 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6437 i--;
6438 j = i;
6439 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006440 }
6441 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006444 if (PyList_Reverse(list) < 0)
6445 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446 return list;
6447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006449 Py_DECREF(list);
6450 return NULL;
6451}
6452
Benjamin Peterson14339b62009-01-31 16:36:08 +00006453static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006454PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 PyObject *list,
6456 Py_UNICODE ch,
6457 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006459 register Py_ssize_t i;
6460 register Py_ssize_t j;
6461 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006462 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006463 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006464
6465 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 if (buf[i] == ch) {
6467 if (maxcount-- <= 0)
6468 break;
6469 SPLIT_APPEND(buf, i + 1, j + 1);
6470 j = i = i - 1;
6471 } else
6472 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006473 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006474 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006476 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (PyList_Reverse(list) < 0)
6478 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479 return list;
6480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006482 Py_DECREF(list);
6483 return NULL;
6484}
6485
Benjamin Peterson14339b62009-01-31 16:36:08 +00006486static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006487PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 PyObject *list,
6489 PyUnicodeObject *substring,
6490 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006491{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 register Py_ssize_t i;
6493 register Py_ssize_t j;
6494 Py_ssize_t len = self->length;
6495 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006496 PyObject *str;
6497
6498 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 if (Py_UNICODE_MATCH(self, i, substring)) {
6500 if (maxcount-- <= 0)
6501 break;
6502 SPLIT_APPEND(self->str, i + sublen, j);
6503 j = i;
6504 i -= sublen;
6505 } else
6506 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006507 }
6508 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 if (PyList_Reverse(list) < 0)
6512 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006513 return list;
6514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006516 Py_DECREF(list);
6517 return NULL;
6518}
6519
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520#undef SPLIT_APPEND
6521
6522static
6523PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 PyUnicodeObject *substring,
6525 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
6527 PyObject *list;
6528
6529 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006530 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
6532 list = PyList_New(0);
6533 if (!list)
6534 return NULL;
6535
6536 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541
6542 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 Py_DECREF(list);
6544 PyErr_SetString(PyExc_ValueError, "empty separator");
6545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 }
6547 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
Tim Petersced69f82003-09-16 20:30:58 +00006551static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006552PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 PyUnicodeObject *substring,
6554 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006555{
6556 PyObject *list;
6557
6558 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006559 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006560
6561 list = PyList_New(0);
6562 if (!list)
6563 return NULL;
6564
6565 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006567
6568 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006570
6571 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 Py_DECREF(list);
6573 PyErr_SetString(PyExc_ValueError, "empty separator");
6574 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006575 }
6576 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006578}
6579
6580static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 PyUnicodeObject *str1,
6583 PyUnicodeObject *str2,
6584 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
6586 PyUnicodeObject *u;
6587
6588 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 if (str1->length == str2->length) {
6592 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006593 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006594 if (str1->length == 1) {
6595 /* replace characters */
6596 Py_UNICODE u1, u2;
6597 if (!findchar(self->str, self->length, str1->str[0]))
6598 goto nothing;
6599 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6600 if (!u)
6601 return NULL;
6602 Py_UNICODE_COPY(u->str, self->str, self->length);
6603 u1 = str1->str[0];
6604 u2 = str2->str[0];
6605 for (i = 0; i < u->length; i++)
6606 if (u->str[i] == u1) {
6607 if (--maxcount < 0)
6608 break;
6609 u->str[i] = u2;
6610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006612 i = fastsearch(
6613 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006615 if (i < 0)
6616 goto nothing;
6617 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6618 if (!u)
6619 return NULL;
6620 Py_UNICODE_COPY(u->str, self->str, self->length);
6621 while (i <= self->length - str1->length)
6622 if (Py_UNICODE_MATCH(self, i, str1)) {
6623 if (--maxcount < 0)
6624 break;
6625 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6626 i += str1->length;
6627 } else
6628 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631
6632 Py_ssize_t n, i, j, e;
6633 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 Py_UNICODE *p;
6635
6636 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006637 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 if (n > maxcount)
6639 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006640 if (n == 0)
6641 goto nothing;
6642 /* new_size = self->length + n * (str2->length - str1->length)); */
6643 delta = (str2->length - str1->length);
6644 if (delta == 0) {
6645 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006647 product = n * (str2->length - str1->length);
6648 if ((product / (str2->length - str1->length)) != n) {
6649 PyErr_SetString(PyExc_OverflowError,
6650 "replace string is too long");
6651 return NULL;
6652 }
6653 new_size = self->length + product;
6654 if (new_size < 0) {
6655 PyErr_SetString(PyExc_OverflowError,
6656 "replace string is too long");
6657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 }
6659 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006660 u = _PyUnicode_New(new_size);
6661 if (!u)
6662 return NULL;
6663 i = 0;
6664 p = u->str;
6665 e = self->length - str1->length;
6666 if (str1->length > 0) {
6667 while (n-- > 0) {
6668 /* look for next match */
6669 j = i;
6670 while (j <= e) {
6671 if (Py_UNICODE_MATCH(self, j, str1))
6672 break;
6673 j++;
6674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 if (j > e)
6677 break;
6678 /* copy unchanged part [i:j] */
6679 Py_UNICODE_COPY(p, self->str+i, j-i);
6680 p += j - i;
6681 }
6682 /* copy substitution string */
6683 if (str2->length > 0) {
6684 Py_UNICODE_COPY(p, str2->str, str2->length);
6685 p += str2->length;
6686 }
6687 i = j + str1->length;
6688 }
6689 if (i < self->length)
6690 /* copy tail [i:] */
6691 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6692 } else {
6693 /* interleave */
6694 while (n > 0) {
6695 Py_UNICODE_COPY(p, str2->str, str2->length);
6696 p += str2->length;
6697 if (--n <= 0)
6698 break;
6699 *p++ = self->str[i++];
6700 }
6701 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006707 /* nothing to replace; return original string (when possible) */
6708 if (PyUnicode_CheckExact(self)) {
6709 Py_INCREF(self);
6710 return (PyObject *) self;
6711 }
6712 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
6715/* --- Unicode Object Methods --------------------------------------------- */
6716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006717PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719\n\
6720Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722
6723static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006724unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 return fixup(self, fixtitle);
6727}
6728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006729PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731\n\
6732Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006736unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return fixup(self, fixcapitalize);
6739}
6740
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the enclosing #if 0.
   Splits self into a list of words, replaces every list item with the
   result of fixup(item, fixcapitalize) and joins the list again.
   Returns NULL if splitting, fixing up or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);
        PyObject *fixed = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6778
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006779/* Argument converter. Coerces to a single unicode character */
6780
6781static int
6782convert_uc(PyObject *obj, void *addr)
6783{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6785 PyObject *uniobj;
6786 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006787
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788 uniobj = PyUnicode_FromObject(obj);
6789 if (uniobj == NULL) {
6790 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 return 0;
6793 }
6794 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6795 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006797 Py_DECREF(uniobj);
6798 return 0;
6799 }
6800 unistr = PyUnicode_AS_UNICODE(uniobj);
6801 *fillcharloc = unistr[0];
6802 Py_DECREF(uniobj);
6803 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006809Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006810done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject *
6813unicode_center(PyUnicodeObject *self, PyObject *args)
6814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t marg, left;
6816 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006817 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Thomas Woutersde017742006-02-16 19:34:37 +00006819 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 return NULL;
6821
Tim Peters7a29bd52001-09-12 03:03:31 +00006822 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 Py_INCREF(self);
6824 return (PyObject*) self;
6825 }
6826
6827 marg = width - self->length;
6828 left = marg / 2 + (marg & width & 1);
6829
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006830 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Marc-André Lemburge5034372000-08-08 08:04:29 +00006833#if 0
6834
6835/* This code should go into some future Unicode collation support
6836 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006837 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006838
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006839/* speedy UTF-16 code point order comparison */
6840/* gleaned from: */
6841/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006843static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006844{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006845 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006846 0, 0, 0, 0, 0, 0, 0, 0,
6847 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006848 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006849};
6850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851static int
6852unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 Py_UNICODE *s1 = str1->str;
6857 Py_UNICODE *s2 = str2->str;
6858
6859 len1 = str1->length;
6860 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006861
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006863 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006864
6865 c1 = *s1++;
6866 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006867
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 if (c1 > (1<<11) * 26)
6869 c1 += utf16Fixup[c1>>11];
6870 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006871 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006872 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006873
6874 if (c1 != c2)
6875 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006876
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006877 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 }
6879
6880 return (len1 < len2) ? -1 : (len1 != len2);
6881}
6882
Marc-André Lemburge5034372000-08-08 08:04:29 +00006883#else
6884
6885static int
6886unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006888 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006889
6890 Py_UNICODE *s1 = str1->str;
6891 Py_UNICODE *s2 = str2->str;
6892
6893 len1 = str1->length;
6894 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006895
Marc-André Lemburge5034372000-08-08 08:04:29 +00006896 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006897 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006898
Fredrik Lundh45714e92001-06-26 16:39:36 +00006899 c1 = *s1++;
6900 c2 = *s2++;
6901
6902 if (c1 != c2)
6903 return (c1 < c2) ? -1 : 1;
6904
Marc-André Lemburge5034372000-08-08 08:04:29 +00006905 len1--; len2--;
6906 }
6907
6908 return (len1 < len2) ? -1 : (len1 != len2);
6909}
6910
6911#endif
6912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006916 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6917 return unicode_compare((PyUnicodeObject *)left,
6918 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006919 PyErr_Format(PyExc_TypeError,
6920 "Can't compare %.100s and %.100s",
6921 left->ob_type->tp_name,
6922 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 return -1;
6924}
6925
Martin v. Löwis5b222132007-06-10 09:51:05 +00006926int
6927PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6928{
6929 int i;
6930 Py_UNICODE *id;
6931 assert(PyUnicode_Check(uni));
6932 id = PyUnicode_AS_UNICODE(uni);
6933 /* Compare Unicode string and source character set string */
6934 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 if (id[i] != str[i])
6936 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006937 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006939 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006941 return 0;
6942}
6943
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006944
Benjamin Peterson29060642009-01-31 22:14:21 +00006945#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006946 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006947
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006948PyObject *PyUnicode_RichCompare(PyObject *left,
6949 PyObject *right,
6950 int op)
6951{
6952 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006953
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006954 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6955 PyObject *v;
6956 if (((PyUnicodeObject *) left)->length !=
6957 ((PyUnicodeObject *) right)->length) {
6958 if (op == Py_EQ) {
6959 Py_INCREF(Py_False);
6960 return Py_False;
6961 }
6962 if (op == Py_NE) {
6963 Py_INCREF(Py_True);
6964 return Py_True;
6965 }
6966 }
6967 if (left == right)
6968 result = 0;
6969 else
6970 result = unicode_compare((PyUnicodeObject *)left,
6971 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006972
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006973 /* Convert the return value to a Boolean */
6974 switch (op) {
6975 case Py_EQ:
6976 v = TEST_COND(result == 0);
6977 break;
6978 case Py_NE:
6979 v = TEST_COND(result != 0);
6980 break;
6981 case Py_LE:
6982 v = TEST_COND(result <= 0);
6983 break;
6984 case Py_GE:
6985 v = TEST_COND(result >= 0);
6986 break;
6987 case Py_LT:
6988 v = TEST_COND(result == -1);
6989 break;
6990 case Py_GT:
6991 v = TEST_COND(result == 1);
6992 break;
6993 default:
6994 PyErr_BadArgument();
6995 return NULL;
6996 }
6997 Py_INCREF(v);
6998 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006999 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007001 Py_INCREF(Py_NotImplemented);
7002 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007003}
7004
Guido van Rossum403d68b2000-03-13 15:55:09 +00007005int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007007{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007008 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007009 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007010
7011 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 sub = PyUnicode_FromObject(element);
7013 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 PyErr_Format(PyExc_TypeError,
7015 "'in <string>' requires string as left operand, not %s",
7016 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007018 }
7019
Thomas Wouters477c8d52006-05-27 19:21:47 +00007020 str = PyUnicode_FromObject(container);
7021 if (!str) {
7022 Py_DECREF(sub);
7023 return -1;
7024 }
7025
7026 result = stringlib_contains_obj(str, sub);
7027
7028 Py_DECREF(str);
7029 Py_DECREF(sub);
7030
Guido van Rossum403d68b2000-03-13 15:55:09 +00007031 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007032}
7033
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034/* Concat to string or Unicode object giving a new Unicode object. */
7035
7036PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
7039 PyUnicodeObject *u = NULL, *v = NULL, *w;
7040
7041 /* Coerce the two arguments */
7042 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7043 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7046 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049 /* Shortcuts */
7050 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 Py_DECREF(v);
7052 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 }
7054 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 Py_DECREF(u);
7056 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 }
7058
7059 /* Concat the two Unicode strings */
7060 w = _PyUnicode_New(u->length + v->length);
7061 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 Py_UNICODE_COPY(w->str, u->str, u->length);
7064 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7065
7066 Py_DECREF(u);
7067 Py_DECREF(v);
7068 return (PyObject *)w;
7069
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 Py_XDECREF(u);
7072 Py_XDECREF(v);
7073 return NULL;
7074}
7075
Walter Dörwald1ab83302007-05-18 17:15:44 +00007076void
7077PyUnicode_Append(PyObject **pleft, PyObject *right)
7078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007079 PyObject *new;
7080 if (*pleft == NULL)
7081 return;
7082 if (right == NULL || !PyUnicode_Check(*pleft)) {
7083 Py_DECREF(*pleft);
7084 *pleft = NULL;
7085 return;
7086 }
7087 new = PyUnicode_Concat(*pleft, right);
7088 Py_DECREF(*pleft);
7089 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007090}
7091
7092void
7093PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7094{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007095 PyUnicode_Append(pleft, right);
7096 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007097}
7098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007099PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007102Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007103string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject *
7107unicode_count(PyUnicodeObject *self, PyObject *args)
7108{
7109 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007110 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007111 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 PyObject *result;
7113
Guido van Rossumb8872e62000-05-09 14:14:27 +00007114 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 return NULL;
7117
7118 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007119 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007122
Thomas Wouters477c8d52006-05-27 19:21:47 +00007123 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124
Christian Heimes217cfd12007-12-02 14:31:20 +00007125 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007126 stringlib_count(self->str + start, end - start,
7127 substring->str, substring->length)
7128 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007131
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 return result;
7133}
7134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007138Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007139to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007140handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7142'xmlcharrefreplace' as well as any other name registered with\n\
7143codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144
7145static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007146unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007148 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 char *encoding = NULL;
7150 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007151 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007152
Benjamin Peterson308d6372009-09-18 21:42:35 +00007153 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7154 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007156 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007157 if (v == NULL)
7158 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007159 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007160 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007161 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007162 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007163 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007164 Py_DECREF(v);
7165 return NULL;
7166 }
7167 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007168
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007170 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007171}
7172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007173PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175\n\
7176Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
7179static PyObject*
7180unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7181{
7182 Py_UNICODE *e;
7183 Py_UNICODE *p;
7184 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007185 Py_UNICODE *qe;
7186 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 PyUnicodeObject *u;
7188 int tabsize = 8;
7189
7190 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
Thomas Wouters7e474022000-07-16 12:04:32 +00007193 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007194 i = 0; /* chars up to and including most recent \n or \r */
7195 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7196 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 for (p = self->str; p < e; p++)
7198 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 if (tabsize > 0) {
7200 incr = tabsize - (j % tabsize); /* cannot overflow */
7201 if (j > PY_SSIZE_T_MAX - incr)
7202 goto overflow1;
7203 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 if (j > PY_SSIZE_T_MAX - 1)
7208 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 j++;
7210 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 if (i > PY_SSIZE_T_MAX - j)
7212 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007214 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 }
7216 }
7217
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007218 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007220
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 /* Second pass: create output string and fill it */
7222 u = _PyUnicode_New(i + j);
7223 if (!u)
7224 return NULL;
7225
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007226 j = 0; /* same as in first pass */
7227 q = u->str; /* next output char */
7228 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229
7230 for (p = self->str; p < e; p++)
7231 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 if (tabsize > 0) {
7233 i = tabsize - (j % tabsize);
7234 j += i;
7235 while (i--) {
7236 if (q >= qe)
7237 goto overflow2;
7238 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 else {
7243 if (q >= qe)
7244 goto overflow2;
7245 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007246 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 if (*p == '\n' || *p == '\r')
7248 j = 0;
7249 }
7250
7251 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007252
7253 overflow2:
7254 Py_DECREF(u);
7255 overflow1:
7256 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258}
7259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007260PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262\n\
7263Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007264such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265arguments start and end are interpreted as in slice notation.\n\
7266\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007267Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
7269static PyObject *
7270unicode_find(PyUnicodeObject *self, PyObject *args)
7271{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007272 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007273 Py_ssize_t start;
7274 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007275 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276
Christian Heimes9cd17752007-11-18 19:35:23 +00007277 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
Thomas Wouters477c8d52006-05-27 19:21:47 +00007280 result = stringlib_find_slice(
7281 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7282 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7283 start, end
7284 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007287
Christian Heimes217cfd12007-12-02 14:31:20 +00007288 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289}
7290
7291static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293{
7294 if (index < 0 || index >= self->length) {
7295 PyErr_SetString(PyExc_IndexError, "string index out of range");
7296 return NULL;
7297 }
7298
7299 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7300}
7301
Guido van Rossumc2504932007-09-18 19:42:40 +00007302/* Believe it or not, this produces the same value for ASCII strings
7303 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007305unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306{
Guido van Rossumc2504932007-09-18 19:42:40 +00007307 Py_ssize_t len;
7308 Py_UNICODE *p;
7309 long x;
7310
7311 if (self->hash != -1)
7312 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007313 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007314 p = self->str;
7315 x = *p << 7;
7316 while (--len >= 0)
7317 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007318 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007319 if (x == -1)
7320 x = -2;
7321 self->hash = x;
7322 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323}
7324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007328Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329
7330static PyObject *
7331unicode_index(PyUnicodeObject *self, PyObject *args)
7332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007333 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007334 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007335 Py_ssize_t start;
7336 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337
Christian Heimes9cd17752007-11-18 19:35:23 +00007338 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
Thomas Wouters477c8d52006-05-27 19:21:47 +00007341 result = stringlib_find_slice(
7342 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7343 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7344 start, end
7345 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
7347 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007348
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 if (result < 0) {
7350 PyErr_SetString(PyExc_ValueError, "substring not found");
7351 return NULL;
7352 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007353
Christian Heimes217cfd12007-12-02 14:31:20 +00007354 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355}
7356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007357PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007360Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362
7363static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007364unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365{
7366 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7367 register const Py_UNICODE *e;
7368 int cased;
7369
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370 /* Shortcut for single character strings */
7371 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007374 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007375 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007377
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 e = p + PyUnicode_GET_SIZE(self);
7379 cased = 0;
7380 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007382
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7384 return PyBool_FromLong(0);
7385 else if (!cased && Py_UNICODE_ISLOWER(ch))
7386 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007388 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007391PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007394Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
7397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007398unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399{
7400 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7401 register const Py_UNICODE *e;
7402 int cased;
7403
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 /* Shortcut for single character strings */
7405 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007408 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007409 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007411
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412 e = p + PyUnicode_GET_SIZE(self);
7413 cased = 0;
7414 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007416
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7418 return PyBool_FromLong(0);
7419 else if (!cased && Py_UNICODE_ISUPPER(ch))
7420 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007422 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423}
7424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007425PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007428Return True if S is a titlecased string and there is at least one\n\
7429character in S, i.e. upper- and titlecase characters may only\n\
7430follow uncased characters and lowercase characters only cased ones.\n\
7431Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432
7433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007434unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435{
7436 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7437 register const Py_UNICODE *e;
7438 int cased, previous_is_cased;
7439
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 /* Shortcut for single character strings */
7441 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7443 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007445 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007446 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007448
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 e = p + PyUnicode_GET_SIZE(self);
7450 cased = 0;
7451 previous_is_cased = 0;
7452 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007454
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7456 if (previous_is_cased)
7457 return PyBool_FromLong(0);
7458 previous_is_cased = 1;
7459 cased = 1;
7460 }
7461 else if (Py_UNICODE_ISLOWER(ch)) {
7462 if (!previous_is_cased)
7463 return PyBool_FromLong(0);
7464 previous_is_cased = 1;
7465 cased = 1;
7466 }
7467 else
7468 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007470 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471}
7472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007476Return True if all characters in S are whitespace\n\
7477and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
7479static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007480unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481{
7482 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7483 register const Py_UNICODE *e;
7484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 /* Shortcut for single character strings */
7486 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 Py_UNICODE_ISSPACE(*p))
7488 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007490 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007491 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007493
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 e = p + PyUnicode_GET_SIZE(self);
7495 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 if (!Py_UNICODE_ISSPACE(*p))
7497 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007499 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500}
7501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007502PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007504\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007505Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007507
7508static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007509unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007510{
7511 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7512 register const Py_UNICODE *e;
7513
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007514 /* Shortcut for single character strings */
7515 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 Py_UNICODE_ISALPHA(*p))
7517 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007518
7519 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007520 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007522
7523 e = p + PyUnicode_GET_SIZE(self);
7524 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 if (!Py_UNICODE_ISALPHA(*p))
7526 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007528 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007529}
7530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007533\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007534Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007536
7537static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007538unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007539{
7540 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7541 register const Py_UNICODE *e;
7542
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007543 /* Shortcut for single character strings */
7544 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 Py_UNICODE_ISALNUM(*p))
7546 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007547
7548 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007549 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007551
7552 e = p + PyUnicode_GET_SIZE(self);
7553 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 if (!Py_UNICODE_ISALNUM(*p))
7555 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007556 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007557 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007558}
7559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007563Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
7566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007567unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568{
7569 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7570 register const Py_UNICODE *e;
7571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 /* Shortcut for single character strings */
7573 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 Py_UNICODE_ISDECIMAL(*p))
7575 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007577 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007578 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007580
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 e = p + PyUnicode_GET_SIZE(self);
7582 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 if (!Py_UNICODE_ISDECIMAL(*p))
7584 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007586 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587}
7588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007589PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007592Return True if all characters in S are digits\n\
7593and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594
7595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007596unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
7598 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7599 register const Py_UNICODE *e;
7600
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 /* Shortcut for single character strings */
7602 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 Py_UNICODE_ISDIGIT(*p))
7604 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007606 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007607 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007609
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 e = p + PyUnicode_GET_SIZE(self);
7611 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 if (!Py_UNICODE_ISDIGIT(*p))
7613 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007615 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616}
7617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007621Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
7624static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007625unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626{
7627 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7628 register const Py_UNICODE *e;
7629
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 /* Shortcut for single character strings */
7631 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 Py_UNICODE_ISNUMERIC(*p))
7633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007635 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007636 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007638
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 e = p + PyUnicode_GET_SIZE(self);
7640 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 if (!Py_UNICODE_ISNUMERIC(*p))
7642 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007644 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
Martin v. Löwis47383402007-08-15 07:32:56 +00007647int
7648PyUnicode_IsIdentifier(PyObject *self)
7649{
7650 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7651 register const Py_UNICODE *e;
7652
7653 /* Special case for empty strings */
7654 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007656
7657 /* PEP 3131 says that the first character must be in
7658 XID_Start and subsequent characters in XID_Continue,
7659 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007661 letters, digits, underscore). However, given the current
7662 definition of XID_Start and XID_Continue, it is sufficient
7663 to check just for these, except that _ must be allowed
7664 as starting an identifier. */
7665 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7666 return 0;
7667
7668 e = p + PyUnicode_GET_SIZE(self);
7669 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 if (!_PyUnicode_IsXidContinue(*p))
7671 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007672 }
7673 return 1;
7674}
7675
7676PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007678\n\
7679Return True if S is a valid identifier according\n\
7680to the language definition.");
7681
7682static PyObject*
7683unicode_isidentifier(PyObject *self)
7684{
7685 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7686}
7687
Georg Brandl559e5d72008-06-11 18:37:52 +00007688PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007690\n\
7691Return True if all characters in S are considered\n\
7692printable in repr() or S is empty, False otherwise.");
7693
7694static PyObject*
7695unicode_isprintable(PyObject *self)
7696{
7697 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7698 register const Py_UNICODE *e;
7699
7700 /* Shortcut for single character strings */
7701 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7702 Py_RETURN_TRUE;
7703 }
7704
7705 e = p + PyUnicode_GET_SIZE(self);
7706 for (; p < e; p++) {
7707 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7708 Py_RETURN_FALSE;
7709 }
7710 }
7711 Py_RETURN_TRUE;
7712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007715 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716\n\
7717Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007718iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007723 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724}
7725
Martin v. Löwis18e16552006-02-15 17:27:45 +00007726static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727unicode_length(PyUnicodeObject *self)
7728{
7729 return self->length;
7730}
7731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007732PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007735Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007736done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
7738static PyObject *
7739unicode_ljust(PyUnicodeObject *self, PyObject *args)
7740{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007741 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007742 Py_UNICODE fillchar = ' ';
7743
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007744 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 return NULL;
7746
Tim Peters7a29bd52001-09-12 03:03:31 +00007747 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 Py_INCREF(self);
7749 return (PyObject*) self;
7750 }
7751
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007752 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753}
7754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007755PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
7760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007761unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 return fixup(self, fixlower);
7764}
7765
/* Strip modes.  The numeric values double as indexes into
   stripformat[] below, so the two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7774
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007775/* externally visible for str.strip(unicode) */
7776PyObject *
7777_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7780 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7781 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7782 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7783 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007784
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007786
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 i = 0;
7788 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7790 i++;
7791 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007793
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 j = len;
7795 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 do {
7797 j--;
7798 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7799 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007801
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_INCREF(self);
7804 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 }
7806 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808}
7809
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810
7811static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007812do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7815 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 i = 0;
7818 if (striptype != RIGHTSTRIP) {
7819 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7820 i++;
7821 }
7822 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007823
Benjamin Peterson14339b62009-01-31 16:36:08 +00007824 j = len;
7825 if (striptype != LEFTSTRIP) {
7826 do {
7827 j--;
7828 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7829 j++;
7830 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007831
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7833 Py_INCREF(self);
7834 return (PyObject*)self;
7835 }
7836 else
7837 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838}
7839
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007840
7841static PyObject *
7842do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7843{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007845
Benjamin Peterson14339b62009-01-31 16:36:08 +00007846 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7847 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007848
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 if (sep != NULL && sep != Py_None) {
7850 if (PyUnicode_Check(sep))
7851 return _PyUnicode_XStrip(self, striptype, sep);
7852 else {
7853 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 "%s arg must be None or str",
7855 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 return NULL;
7857 }
7858 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007861}
7862
7863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007864PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007866\n\
7867Return a copy of the string S with leading and trailing\n\
7868whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007869If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007870
7871static PyObject *
7872unicode_strip(PyUnicodeObject *self, PyObject *args)
7873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 if (PyTuple_GET_SIZE(args) == 0)
7875 return do_strip(self, BOTHSTRIP); /* Common case */
7876 else
7877 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007878}
7879
7880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007881PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007883\n\
7884Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007885If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007886
7887static PyObject *
7888unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7889{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 if (PyTuple_GET_SIZE(args) == 0)
7891 return do_strip(self, LEFTSTRIP); /* Common case */
7892 else
7893 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007894}
7895
7896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007897PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007899\n\
7900Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007901If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007902
7903static PyObject *
7904unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7905{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007906 if (PyTuple_GET_SIZE(args) == 0)
7907 return do_strip(self, RIGHTSTRIP); /* Common case */
7908 else
7909 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007910}
7911
7912
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007914unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915{
7916 PyUnicodeObject *u;
7917 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007918 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007919 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920
Georg Brandl222de0f2009-04-12 12:01:50 +00007921 if (len < 1) {
7922 Py_INCREF(unicode_empty);
7923 return (PyObject *)unicode_empty;
7924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925
Tim Peters7a29bd52001-09-12 03:03:31 +00007926 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 /* no repeat, return original string */
7928 Py_INCREF(str);
7929 return (PyObject*) str;
7930 }
Tim Peters8f422462000-09-09 06:13:41 +00007931
7932 /* ensure # of chars needed doesn't overflow int and # of bytes
7933 * needed doesn't overflow size_t
7934 */
7935 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007936 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007937 PyErr_SetString(PyExc_OverflowError,
7938 "repeated string is too long");
7939 return NULL;
7940 }
7941 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7942 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7943 PyErr_SetString(PyExc_OverflowError,
7944 "repeated string is too long");
7945 return NULL;
7946 }
7947 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 if (!u)
7949 return NULL;
7950
7951 p = u->str;
7952
Georg Brandl222de0f2009-04-12 12:01:50 +00007953 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954 Py_UNICODE_FILL(p, str->str[0], len);
7955 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007956 Py_ssize_t done = str->length; /* number of characters copied this far */
7957 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007959 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960 Py_UNICODE_COPY(p+done, p, n);
7961 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 }
7964
7965 return (PyObject*) u;
7966}
7967
7968PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 PyObject *subobj,
7970 PyObject *replobj,
7971 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
7973 PyObject *self;
7974 PyObject *str1;
7975 PyObject *str2;
7976 PyObject *result;
7977
7978 self = PyUnicode_FromObject(obj);
7979 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 str1 = PyUnicode_FromObject(subobj);
7982 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 Py_DECREF(self);
7984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
7986 str2 = PyUnicode_FromObject(replobj);
7987 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_DECREF(self);
7989 Py_DECREF(str1);
7990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
Tim Petersced69f82003-09-16 20:30:58 +00007992 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 (PyUnicodeObject *)str1,
7994 (PyUnicodeObject *)str2,
7995 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 Py_DECREF(self);
7997 Py_DECREF(str1);
7998 Py_DECREF(str2);
7999 return result;
8000}
8001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008002PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004\n\
8005Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008006old replaced by new. If the optional argument count is\n\
8007given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
8009static PyObject*
8010unicode_replace(PyUnicodeObject *self, PyObject *args)
8011{
8012 PyUnicodeObject *str1;
8013 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008014 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 PyObject *result;
8016
Martin v. Löwis18e16552006-02-15 17:27:45 +00008017 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 return NULL;
8019 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8020 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008023 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 Py_DECREF(str1);
8025 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
8028 result = replace(self, str1, str2, maxcount);
8029
8030 Py_DECREF(str1);
8031 Py_DECREF(str2);
8032 return result;
8033}
8034
8035static
8036PyObject *unicode_repr(PyObject *unicode)
8037{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008038 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008039 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008040 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8041 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8042
8043 /* XXX(nnorwitz): rather than over-allocating, it would be
8044 better to choose a different scheme. Perhaps scan the
8045 first N-chars of the string and allocate based on that size.
8046 */
8047 /* Initial allocation is based on the longest-possible unichr
8048 escape.
8049
8050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8051 unichr, so in this case it's the longest unichr escape. In
8052 narrow (UTF-16) builds this is five chars per source unichr
8053 since there are two unichrs in the surrogate pair, so in narrow
8054 (UTF-16) builds it's not the longest unichr escape.
8055
8056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8057 so in the narrow (UTF-16) build case it's the longest unichr
8058 escape.
8059 */
8060
Walter Dörwald1ab83302007-05-18 17:15:44 +00008061 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008063#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008065#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008069 if (repr == NULL)
8070 return NULL;
8071
Walter Dörwald1ab83302007-05-18 17:15:44 +00008072 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008073
8074 /* Add quote */
8075 *p++ = (findchar(s, size, '\'') &&
8076 !findchar(s, size, '"')) ? '"' : '\'';
8077 while (size-- > 0) {
8078 Py_UNICODE ch = *s++;
8079
8080 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008081 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008082 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008083 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008084 continue;
8085 }
8086
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008088 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008089 *p++ = '\\';
8090 *p++ = 't';
8091 }
8092 else if (ch == '\n') {
8093 *p++ = '\\';
8094 *p++ = 'n';
8095 }
8096 else if (ch == '\r') {
8097 *p++ = '\\';
8098 *p++ = 'r';
8099 }
8100
8101 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008102 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008103 *p++ = '\\';
8104 *p++ = 'x';
8105 *p++ = hexdigits[(ch >> 4) & 0x000F];
8106 *p++ = hexdigits[ch & 0x000F];
8107 }
8108
Georg Brandl559e5d72008-06-11 18:37:52 +00008109 /* Copy ASCII characters as-is */
8110 else if (ch < 0x7F) {
8111 *p++ = ch;
8112 }
8113
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008115 else {
8116 Py_UCS4 ucs = ch;
8117
8118#ifndef Py_UNICODE_WIDE
8119 Py_UNICODE ch2 = 0;
8120 /* Get code point from surrogate pair */
8121 if (size > 0) {
8122 ch2 = *s;
8123 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008127 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008128 size--;
8129 }
8130 }
8131#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008133 (categories Z* and C* except ASCII space)
8134 */
8135 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8136 /* Map 8-bit characters to '\xhh' */
8137 if (ucs <= 0xff) {
8138 *p++ = '\\';
8139 *p++ = 'x';
8140 *p++ = hexdigits[(ch >> 4) & 0x000F];
8141 *p++ = hexdigits[ch & 0x000F];
8142 }
8143 /* Map 21-bit characters to '\U00xxxxxx' */
8144 else if (ucs >= 0x10000) {
8145 *p++ = '\\';
8146 *p++ = 'U';
8147 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8148 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8149 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8150 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8151 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8152 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8153 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8154 *p++ = hexdigits[ucs & 0x0000000F];
8155 }
8156 /* Map 16-bit characters to '\uxxxx' */
8157 else {
8158 *p++ = '\\';
8159 *p++ = 'u';
8160 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8161 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8162 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8163 *p++ = hexdigits[ucs & 0x000F];
8164 }
8165 }
8166 /* Copy characters as-is */
8167 else {
8168 *p++ = ch;
8169#ifndef Py_UNICODE_WIDE
8170 if (ucs >= 0x10000)
8171 *p++ = ch2;
8172#endif
8173 }
8174 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008175 }
8176 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008177 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008178
8179 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008180 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008181 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182}
8183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008184PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186\n\
8187Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008188such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189arguments start and end are interpreted as in slice notation.\n\
8190\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008191Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192
8193static PyObject *
8194unicode_rfind(PyUnicodeObject *self, PyObject *args)
8195{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008196 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008197 Py_ssize_t start;
8198 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200
Christian Heimes9cd17752007-11-18 19:35:23 +00008201 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203
Thomas Wouters477c8d52006-05-27 19:21:47 +00008204 result = stringlib_rfind_slice(
8205 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8206 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8207 start, end
8208 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
8210 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008211
Christian Heimes217cfd12007-12-02 14:31:20 +00008212 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213}
8214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008215PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008218Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
8220static PyObject *
8221unicode_rindex(PyUnicodeObject *self, PyObject *args)
8222{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008223 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008224 Py_ssize_t start;
8225 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008226 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
Christian Heimes9cd17752007-11-18 19:35:23 +00008228 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 result = stringlib_rfind_slice(
8232 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8233 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8234 start, end
8235 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
8237 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 if (result < 0) {
8240 PyErr_SetString(PyExc_ValueError, "substring not found");
8241 return NULL;
8242 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008243 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244}
8245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008246PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008249Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008250done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
8252static PyObject *
8253unicode_rjust(PyUnicodeObject *self, PyObject *args)
8254{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008255 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008256 Py_UNICODE fillchar = ' ';
8257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008258 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 return NULL;
8260
Tim Peters7a29bd52001-09-12 03:03:31 +00008261 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 Py_INCREF(self);
8263 return (PyObject*) self;
8264 }
8265
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008266 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267}
8268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 PyObject *sep,
8271 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272{
8273 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 s = PyUnicode_FromObject(s);
8276 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008277 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 if (sep != NULL) {
8279 sep = PyUnicode_FromObject(sep);
8280 if (sep == NULL) {
8281 Py_DECREF(s);
8282 return NULL;
8283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
8285
8286 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8287
8288 Py_DECREF(s);
8289 Py_XDECREF(sep);
8290 return result;
8291}
8292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008293PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295\n\
8296Return a list of the words in S, using sep as the\n\
8297delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008298splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008299whitespace string is a separator and empty strings are\n\
8300removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302static PyObject*
8303unicode_split(PyUnicodeObject *self, PyObject *args)
8304{
8305 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008306 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307
Martin v. Löwis18e16552006-02-15 17:27:45 +00008308 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 return NULL;
8310
8311 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317}
8318
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319PyObject *
8320PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8321{
8322 PyObject* str_obj;
8323 PyObject* sep_obj;
8324 PyObject* out;
8325
8326 str_obj = PyUnicode_FromObject(str_in);
8327 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008329 sep_obj = PyUnicode_FromObject(sep_in);
8330 if (!sep_obj) {
8331 Py_DECREF(str_obj);
8332 return NULL;
8333 }
8334
8335 out = stringlib_partition(
8336 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8337 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8338 );
8339
8340 Py_DECREF(sep_obj);
8341 Py_DECREF(str_obj);
8342
8343 return out;
8344}
8345
8346
8347PyObject *
8348PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8349{
8350 PyObject* str_obj;
8351 PyObject* sep_obj;
8352 PyObject* out;
8353
8354 str_obj = PyUnicode_FromObject(str_in);
8355 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008357 sep_obj = PyUnicode_FromObject(sep_in);
8358 if (!sep_obj) {
8359 Py_DECREF(str_obj);
8360 return NULL;
8361 }
8362
8363 out = stringlib_rpartition(
8364 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8365 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8366 );
8367
8368 Py_DECREF(sep_obj);
8369 Py_DECREF(str_obj);
8370
8371 return out;
8372}
8373
8374PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008376\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008377Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008379found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008380
8381static PyObject*
8382unicode_partition(PyUnicodeObject *self, PyObject *separator)
8383{
8384 return PyUnicode_Partition((PyObject *)self, separator);
8385}
8386
8387PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008390Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008391the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008392separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008393
8394static PyObject*
8395unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8396{
8397 return PyUnicode_RPartition((PyObject *)self, separator);
8398}
8399
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008400PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 PyObject *sep,
8402 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008403{
8404 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008406 s = PyUnicode_FromObject(s);
8407 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 if (sep != NULL) {
8410 sep = PyUnicode_FromObject(sep);
8411 if (sep == NULL) {
8412 Py_DECREF(s);
8413 return NULL;
8414 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008415 }
8416
8417 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8418
8419 Py_DECREF(s);
8420 Py_XDECREF(sep);
8421 return result;
8422}
8423
8424PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008426\n\
8427Return a list of the words in S, using sep as the\n\
8428delimiter string, starting at the end of the string and\n\
8429working to the front. If maxsplit is given, at most maxsplit\n\
8430splits are done. If sep is not specified, any whitespace string\n\
8431is a separator.");
8432
8433static PyObject*
8434unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8435{
8436 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008438
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008440 return NULL;
8441
8442 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008444 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008446 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008448}
8449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008450PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452\n\
8453Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008454Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008455is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456
8457static PyObject*
8458unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8459{
Guido van Rossum86662912000-04-11 15:38:46 +00008460 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461
Guido van Rossum86662912000-04-11 15:38:46 +00008462 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 return NULL;
8464
Guido van Rossum86662912000-04-11 15:38:46 +00008465 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466}
8467
8468static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008469PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Walter Dörwald346737f2007-05-31 10:44:43 +00008471 if (PyUnicode_CheckExact(self)) {
8472 Py_INCREF(self);
8473 return self;
8474 } else
8475 /* Subtype -- return genuine unicode string with the same value. */
8476 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8477 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478}
8479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008480PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482\n\
8483Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008484and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485
8486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008487unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 return fixup(self, fixswapcase);
8490}
8491
Georg Brandlceee0772007-11-27 23:48:05 +00008492PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008494\n\
8495Return a translation table usable for str.translate().\n\
8496If there is only one argument, it must be a dictionary mapping Unicode\n\
8497ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008498Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008499If there are two arguments, they must be strings of equal length, and\n\
8500in the resulting dictionary, each character in x will be mapped to the\n\
8501character at the same position in y. If there is a third argument, it\n\
8502must be a string, whose characters will be mapped to None in the result.");
8503
8504static PyObject*
8505unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8506{
8507 PyObject *x, *y = NULL, *z = NULL;
8508 PyObject *new = NULL, *key, *value;
8509 Py_ssize_t i = 0;
8510 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511
Georg Brandlceee0772007-11-27 23:48:05 +00008512 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8513 return NULL;
8514 new = PyDict_New();
8515 if (!new)
8516 return NULL;
8517 if (y != NULL) {
8518 /* x must be a string too, of equal length */
8519 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8520 if (!PyUnicode_Check(x)) {
8521 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8522 "be a string if there is a second argument");
8523 goto err;
8524 }
8525 if (PyUnicode_GET_SIZE(x) != ylen) {
8526 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8527 "arguments must have equal length");
8528 goto err;
8529 }
8530 /* create entries for translating chars in x to those in y */
8531 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008532 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8533 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008534 if (!key || !value)
8535 goto err;
8536 res = PyDict_SetItem(new, key, value);
8537 Py_DECREF(key);
8538 Py_DECREF(value);
8539 if (res < 0)
8540 goto err;
8541 }
8542 /* create entries for deleting chars in z */
8543 if (z != NULL) {
8544 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008545 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008546 if (!key)
8547 goto err;
8548 res = PyDict_SetItem(new, key, Py_None);
8549 Py_DECREF(key);
8550 if (res < 0)
8551 goto err;
8552 }
8553 }
8554 } else {
8555 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008556 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008557 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8558 "to maketrans it must be a dict");
8559 goto err;
8560 }
8561 /* copy entries into the new dict, converting string keys to int keys */
8562 while (PyDict_Next(x, &i, &key, &value)) {
8563 if (PyUnicode_Check(key)) {
8564 /* convert string keys to integer keys */
8565 PyObject *newkey;
8566 if (PyUnicode_GET_SIZE(key) != 1) {
8567 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8568 "table must be of length 1");
8569 goto err;
8570 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008571 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008572 if (!newkey)
8573 goto err;
8574 res = PyDict_SetItem(new, newkey, value);
8575 Py_DECREF(newkey);
8576 if (res < 0)
8577 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008578 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008579 /* just keep integer keys */
8580 if (PyDict_SetItem(new, key, value) < 0)
8581 goto err;
8582 } else {
8583 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8584 "be strings or integers");
8585 goto err;
8586 }
8587 }
8588 }
8589 return new;
8590 err:
8591 Py_DECREF(new);
8592 return NULL;
8593}
8594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008595PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597\n\
8598Return a copy of the string S, where all characters have been mapped\n\
8599through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008600Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008601Unmapped characters are left untouched. Characters mapped to None\n\
8602are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603
8604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008605unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606{
Georg Brandlceee0772007-11-27 23:48:05 +00008607 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608}
8609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008610PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008613Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614
8615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008616unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 return fixup(self, fixupper);
8619}
8620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008621PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008624Pad a numeric string S with zeros on the left, to fill a field\n\
8625of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626
8627static PyObject *
8628unicode_zfill(PyUnicodeObject *self, PyObject *args)
8629{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008630 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 PyUnicodeObject *u;
8632
Martin v. Löwis18e16552006-02-15 17:27:45 +00008633 Py_ssize_t width;
8634 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 return NULL;
8636
8637 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008638 if (PyUnicode_CheckExact(self)) {
8639 Py_INCREF(self);
8640 return (PyObject*) self;
8641 }
8642 else
8643 return PyUnicode_FromUnicode(
8644 PyUnicode_AS_UNICODE(self),
8645 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 }
8648
8649 fill = width - self->length;
8650
8651 u = pad(self, fill, 0, '0');
8652
Walter Dörwald068325e2002-04-15 13:36:47 +00008653 if (u == NULL)
8654 return NULL;
8655
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 if (u->str[fill] == '+' || u->str[fill] == '-') {
8657 /* move sign to beginning of string */
8658 u->str[0] = u->str[fill];
8659 u->str[fill] = '0';
8660 }
8661
8662 return (PyObject*) u;
8663}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
#if 0
/* Compiled out.  Would return the value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008673PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008676Return True if S starts with the specified prefix, False otherwise.\n\
8677With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008678With optional end, stop comparing S at that position.\n\
8679prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680
8681static PyObject *
8682unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008685 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008687 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008688 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008689 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008691 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8693 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008694 if (PyTuple_Check(subobj)) {
8695 Py_ssize_t i;
8696 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8697 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008699 if (substring == NULL)
8700 return NULL;
8701 result = tailmatch(self, substring, start, end, -1);
8702 Py_DECREF(substring);
8703 if (result) {
8704 Py_RETURN_TRUE;
8705 }
8706 }
8707 /* nothing matched */
8708 Py_RETURN_FALSE;
8709 }
8710 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008713 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008715 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716}
8717
8718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008719PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008722Return True if S ends with the specified suffix, False otherwise.\n\
8723With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724With optional end, stop comparing S at that position.\n\
8725suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726
8727static PyObject *
8728unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008731 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008733 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008734 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008735 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008737 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8739 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008740 if (PyTuple_Check(subobj)) {
8741 Py_ssize_t i;
8742 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8743 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008745 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008747 result = tailmatch(self, substring, start, end, +1);
8748 Py_DECREF(substring);
8749 if (result) {
8750 Py_RETURN_TRUE;
8751 }
8752 }
8753 Py_RETURN_FALSE;
8754 }
8755 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008759 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008761 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762}
8763
Eric Smith8c663262007-08-25 02:26:07 +00008764#include "stringlib/string_format.h"
8765
8766PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008768\n\
8769");
8770
Eric Smith4a7d76d2008-05-30 18:10:19 +00008771static PyObject *
8772unicode__format__(PyObject* self, PyObject* args)
8773{
8774 PyObject *format_spec;
8775
8776 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8777 return NULL;
8778
8779 return _PyUnicode_FormatAdvanced(self,
8780 PyUnicode_AS_UNICODE(format_spec),
8781 PyUnicode_GET_SIZE(format_spec));
8782}
8783
Eric Smith8c663262007-08-25 02:26:07 +00008784PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008786\n\
8787");
8788
8789static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008790unicode__sizeof__(PyUnicodeObject *v)
8791{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008792 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8793 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008794}
8795
8796PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008798
8799static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008800unicode_getnewargs(PyUnicodeObject *v)
8801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008803}
8804
8805
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806static PyMethodDef unicode_methods[] = {
8807
8808 /* Order is according to common usage: often used methods should
8809 appear first, since lookup is done sequentially. */
8810
Benjamin Peterson308d6372009-09-18 21:42:35 +00008811 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008812 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8813 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008814 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008815 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8816 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8817 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8818 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8819 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8820 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8821 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008822 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008823 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8824 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8825 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008826 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008827 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8828 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8829 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008830 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008831 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008832 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008833 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008834 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8835 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8836 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8837 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8838 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8839 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8840 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8841 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8842 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8843 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8844 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8845 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8846 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8847 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008848 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008849 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008850 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008851 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008852 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008853 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8854 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008855 {"maketrans", (PyCFunction) unicode_maketrans,
8856 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008857 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008858#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008859 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860#endif
8861
8862#if 0
8863 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008864 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865#endif
8866
Benjamin Peterson14339b62009-01-31 16:36:08 +00008867 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 {NULL, NULL}
8869};
8870
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008871static PyObject *
8872unicode_mod(PyObject *v, PyObject *w)
8873{
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 if (!PyUnicode_Check(v)) {
8875 Py_INCREF(Py_NotImplemented);
8876 return Py_NotImplemented;
8877 }
8878 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008879}
8880
8881static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008882 0, /*nb_add*/
8883 0, /*nb_subtract*/
8884 0, /*nb_multiply*/
8885 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008886};
8887
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 (lenfunc) unicode_length, /* sq_length */
8890 PyUnicode_Concat, /* sq_concat */
8891 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8892 (ssizeargfunc) unicode_getitem, /* sq_item */
8893 0, /* sq_slice */
8894 0, /* sq_ass_item */
8895 0, /* sq_ass_slice */
8896 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897};
8898
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008899static PyObject*
8900unicode_subscript(PyUnicodeObject* self, PyObject* item)
8901{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008902 if (PyIndex_Check(item)) {
8903 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008904 if (i == -1 && PyErr_Occurred())
8905 return NULL;
8906 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008907 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008908 return unicode_getitem(self, i);
8909 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008910 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008911 Py_UNICODE* source_buf;
8912 Py_UNICODE* result_buf;
8913 PyObject* result;
8914
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008915 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008917 return NULL;
8918 }
8919
8920 if (slicelength <= 0) {
8921 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008922 } else if (start == 0 && step == 1 && slicelength == self->length &&
8923 PyUnicode_CheckExact(self)) {
8924 Py_INCREF(self);
8925 return (PyObject *)self;
8926 } else if (step == 1) {
8927 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008928 } else {
8929 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008930 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8931 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 if (result_buf == NULL)
8934 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008935
8936 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8937 result_buf[i] = source_buf[cur];
8938 }
Tim Petersced69f82003-09-16 20:30:58 +00008939
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008940 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008941 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008942 return result;
8943 }
8944 } else {
8945 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8946 return NULL;
8947 }
8948}
8949
8950static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008951 (lenfunc)unicode_length, /* mp_length */
8952 (binaryfunc)unicode_subscript, /* mp_subscript */
8953 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008954};
8955
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957/* Helpers for PyUnicode_Format() */
8958
8959static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008960getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008962 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 (*p_argidx)++;
8965 if (arglen < 0)
8966 return args;
8967 else
8968 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
8970 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 return NULL;
8973}
8974
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008975/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008977static PyObject *
8978formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008980 char *p;
8981 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 x = PyFloat_AsDouble(v);
8985 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008986 return NULL;
8987
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008990
Eric Smith0923d1d2009-04-16 20:16:10 +00008991 p = PyOS_double_to_string(x, type, prec,
8992 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008993 if (p == NULL)
8994 return NULL;
8995 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008996 PyMem_Free(p);
8997 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998}
8999
Tim Peters38fd5b62000-09-21 05:43:11 +00009000static PyObject*
9001formatlong(PyObject *val, int flags, int prec, int type)
9002{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009003 char *buf;
9004 int len;
9005 PyObject *str; /* temporary string object. */
9006 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009007
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9009 if (!str)
9010 return NULL;
9011 result = PyUnicode_FromStringAndSize(buf, len);
9012 Py_DECREF(str);
9013 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009014}
9015
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016static int
9017formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009018 size_t buflen,
9019 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009021 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009022 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 if (PyUnicode_GET_SIZE(v) == 1) {
9024 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9025 buf[1] = '\0';
9026 return 1;
9027 }
9028#ifndef Py_UNICODE_WIDE
9029 if (PyUnicode_GET_SIZE(v) == 2) {
9030 /* Decode a valid surrogate pair */
9031 int c0 = PyUnicode_AS_UNICODE(v)[0];
9032 int c1 = PyUnicode_AS_UNICODE(v)[1];
9033 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9034 0xDC00 <= c1 && c1 <= 0xDFFF) {
9035 buf[0] = c0;
9036 buf[1] = c1;
9037 buf[2] = '\0';
9038 return 2;
9039 }
9040 }
9041#endif
9042 goto onError;
9043 }
9044 else {
9045 /* Integer input truncated to a character */
9046 long x;
9047 x = PyLong_AsLong(v);
9048 if (x == -1 && PyErr_Occurred())
9049 goto onError;
9050
9051 if (x < 0 || x > 0x10ffff) {
9052 PyErr_SetString(PyExc_OverflowError,
9053 "%c arg not in range(0x110000)");
9054 return -1;
9055 }
9056
9057#ifndef Py_UNICODE_WIDE
9058 if (x > 0xffff) {
9059 x -= 0x10000;
9060 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9061 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9062 return 2;
9063 }
9064#endif
9065 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009066 buf[1] = '\0';
9067 return 1;
9068 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009069
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009071 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009073 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074}
9075
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009076/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009077 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009078*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009079#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009080
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083{
9084 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009085 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 int args_owned = 0;
9087 PyUnicodeObject *result = NULL;
9088 PyObject *dict = NULL;
9089 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009090
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 PyErr_BadInternalCall();
9093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 }
9095 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009096 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 fmt = PyUnicode_AS_UNICODE(uformat);
9099 fmtcnt = PyUnicode_GET_SIZE(uformat);
9100
9101 reslen = rescnt = fmtcnt + 100;
9102 result = _PyUnicode_New(reslen);
9103 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 res = PyUnicode_AS_UNICODE(result);
9106
9107 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 arglen = PyTuple_Size(args);
9109 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
9111 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 arglen = -1;
9113 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009115 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009116 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118
9119 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 if (*fmt != '%') {
9121 if (--rescnt < 0) {
9122 rescnt = fmtcnt + 100;
9123 reslen += rescnt;
9124 if (_PyUnicode_Resize(&result, reslen) < 0)
9125 goto onError;
9126 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9127 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 }
9131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 /* Got a format specifier */
9133 int flags = 0;
9134 Py_ssize_t width = -1;
9135 int prec = -1;
9136 Py_UNICODE c = '\0';
9137 Py_UNICODE fill;
9138 int isnumok;
9139 PyObject *v = NULL;
9140 PyObject *temp = NULL;
9141 Py_UNICODE *pbuf;
9142 Py_UNICODE sign;
9143 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009144 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 fmt++;
9147 if (*fmt == '(') {
9148 Py_UNICODE *keystart;
9149 Py_ssize_t keylen;
9150 PyObject *key;
9151 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009152
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 if (dict == NULL) {
9154 PyErr_SetString(PyExc_TypeError,
9155 "format requires a mapping");
9156 goto onError;
9157 }
9158 ++fmt;
9159 --fmtcnt;
9160 keystart = fmt;
9161 /* Skip over balanced parentheses */
9162 while (pcount > 0 && --fmtcnt >= 0) {
9163 if (*fmt == ')')
9164 --pcount;
9165 else if (*fmt == '(')
9166 ++pcount;
9167 fmt++;
9168 }
9169 keylen = fmt - keystart - 1;
9170 if (fmtcnt < 0 || pcount > 0) {
9171 PyErr_SetString(PyExc_ValueError,
9172 "incomplete format key");
9173 goto onError;
9174 }
9175#if 0
9176 /* keys are converted to strings using UTF-8 and
9177 then looked up since Python uses strings to hold
9178 variables names etc. in its namespaces and we
9179 wouldn't want to break common idioms. */
9180 key = PyUnicode_EncodeUTF8(keystart,
9181 keylen,
9182 NULL);
9183#else
9184 key = PyUnicode_FromUnicode(keystart, keylen);
9185#endif
9186 if (key == NULL)
9187 goto onError;
9188 if (args_owned) {
9189 Py_DECREF(args);
9190 args_owned = 0;
9191 }
9192 args = PyObject_GetItem(dict, key);
9193 Py_DECREF(key);
9194 if (args == NULL) {
9195 goto onError;
9196 }
9197 args_owned = 1;
9198 arglen = -1;
9199 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 while (--fmtcnt >= 0) {
9202 switch (c = *fmt++) {
9203 case '-': flags |= F_LJUST; continue;
9204 case '+': flags |= F_SIGN; continue;
9205 case ' ': flags |= F_BLANK; continue;
9206 case '#': flags |= F_ALT; continue;
9207 case '0': flags |= F_ZERO; continue;
9208 }
9209 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009210 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009211 if (c == '*') {
9212 v = getnextarg(args, arglen, &argidx);
9213 if (v == NULL)
9214 goto onError;
9215 if (!PyLong_Check(v)) {
9216 PyErr_SetString(PyExc_TypeError,
9217 "* wants int");
9218 goto onError;
9219 }
9220 width = PyLong_AsLong(v);
9221 if (width == -1 && PyErr_Occurred())
9222 goto onError;
9223 if (width < 0) {
9224 flags |= F_LJUST;
9225 width = -width;
9226 }
9227 if (--fmtcnt >= 0)
9228 c = *fmt++;
9229 }
9230 else if (c >= '0' && c <= '9') {
9231 width = c - '0';
9232 while (--fmtcnt >= 0) {
9233 c = *fmt++;
9234 if (c < '0' || c > '9')
9235 break;
9236 if ((width*10) / 10 != width) {
9237 PyErr_SetString(PyExc_ValueError,
9238 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009239 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 }
9241 width = width*10 + (c - '0');
9242 }
9243 }
9244 if (c == '.') {
9245 prec = 0;
9246 if (--fmtcnt >= 0)
9247 c = *fmt++;
9248 if (c == '*') {
9249 v = getnextarg(args, arglen, &argidx);
9250 if (v == NULL)
9251 goto onError;
9252 if (!PyLong_Check(v)) {
9253 PyErr_SetString(PyExc_TypeError,
9254 "* wants int");
9255 goto onError;
9256 }
9257 prec = PyLong_AsLong(v);
9258 if (prec == -1 && PyErr_Occurred())
9259 goto onError;
9260 if (prec < 0)
9261 prec = 0;
9262 if (--fmtcnt >= 0)
9263 c = *fmt++;
9264 }
9265 else if (c >= '0' && c <= '9') {
9266 prec = c - '0';
9267 while (--fmtcnt >= 0) {
9268 c = Py_CHARMASK(*fmt++);
9269 if (c < '0' || c > '9')
9270 break;
9271 if ((prec*10) / 10 != prec) {
9272 PyErr_SetString(PyExc_ValueError,
9273 "prec too big");
9274 goto onError;
9275 }
9276 prec = prec*10 + (c - '0');
9277 }
9278 }
9279 } /* prec */
9280 if (fmtcnt >= 0) {
9281 if (c == 'h' || c == 'l' || c == 'L') {
9282 if (--fmtcnt >= 0)
9283 c = *fmt++;
9284 }
9285 }
9286 if (fmtcnt < 0) {
9287 PyErr_SetString(PyExc_ValueError,
9288 "incomplete format");
9289 goto onError;
9290 }
9291 if (c != '%') {
9292 v = getnextarg(args, arglen, &argidx);
9293 if (v == NULL)
9294 goto onError;
9295 }
9296 sign = 0;
9297 fill = ' ';
9298 switch (c) {
9299
9300 case '%':
9301 pbuf = formatbuf;
9302 /* presume that buffer length is at least 1 */
9303 pbuf[0] = '%';
9304 len = 1;
9305 break;
9306
9307 case 's':
9308 case 'r':
9309 case 'a':
9310 if (PyUnicode_Check(v) && c == 's') {
9311 temp = v;
9312 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009313 }
9314 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 if (c == 's')
9316 temp = PyObject_Str(v);
9317 else if (c == 'r')
9318 temp = PyObject_Repr(v);
9319 else
9320 temp = PyObject_ASCII(v);
9321 if (temp == NULL)
9322 goto onError;
9323 if (PyUnicode_Check(temp))
9324 /* nothing to do */;
9325 else {
9326 Py_DECREF(temp);
9327 PyErr_SetString(PyExc_TypeError,
9328 "%s argument has non-string str()");
9329 goto onError;
9330 }
9331 }
9332 pbuf = PyUnicode_AS_UNICODE(temp);
9333 len = PyUnicode_GET_SIZE(temp);
9334 if (prec >= 0 && len > prec)
9335 len = prec;
9336 break;
9337
9338 case 'i':
9339 case 'd':
9340 case 'u':
9341 case 'o':
9342 case 'x':
9343 case 'X':
9344 if (c == 'i')
9345 c = 'd';
9346 isnumok = 0;
9347 if (PyNumber_Check(v)) {
9348 PyObject *iobj=NULL;
9349
9350 if (PyLong_Check(v)) {
9351 iobj = v;
9352 Py_INCREF(iobj);
9353 }
9354 else {
9355 iobj = PyNumber_Long(v);
9356 }
9357 if (iobj!=NULL) {
9358 if (PyLong_Check(iobj)) {
9359 isnumok = 1;
9360 temp = formatlong(iobj, flags, prec, c);
9361 Py_DECREF(iobj);
9362 if (!temp)
9363 goto onError;
9364 pbuf = PyUnicode_AS_UNICODE(temp);
9365 len = PyUnicode_GET_SIZE(temp);
9366 sign = 1;
9367 }
9368 else {
9369 Py_DECREF(iobj);
9370 }
9371 }
9372 }
9373 if (!isnumok) {
9374 PyErr_Format(PyExc_TypeError,
9375 "%%%c format: a number is required, "
9376 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9377 goto onError;
9378 }
9379 if (flags & F_ZERO)
9380 fill = '0';
9381 break;
9382
9383 case 'e':
9384 case 'E':
9385 case 'f':
9386 case 'F':
9387 case 'g':
9388 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009389 temp = formatfloat(v, flags, prec, c);
9390 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009392 pbuf = PyUnicode_AS_UNICODE(temp);
9393 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 sign = 1;
9395 if (flags & F_ZERO)
9396 fill = '0';
9397 break;
9398
9399 case 'c':
9400 pbuf = formatbuf;
9401 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9402 if (len < 0)
9403 goto onError;
9404 break;
9405
9406 default:
9407 PyErr_Format(PyExc_ValueError,
9408 "unsupported format character '%c' (0x%x) "
9409 "at index %zd",
9410 (31<=c && c<=126) ? (char)c : '?',
9411 (int)c,
9412 (Py_ssize_t)(fmt - 1 -
9413 PyUnicode_AS_UNICODE(uformat)));
9414 goto onError;
9415 }
9416 if (sign) {
9417 if (*pbuf == '-' || *pbuf == '+') {
9418 sign = *pbuf++;
9419 len--;
9420 }
9421 else if (flags & F_SIGN)
9422 sign = '+';
9423 else if (flags & F_BLANK)
9424 sign = ' ';
9425 else
9426 sign = 0;
9427 }
9428 if (width < len)
9429 width = len;
9430 if (rescnt - (sign != 0) < width) {
9431 reslen -= rescnt;
9432 rescnt = width + fmtcnt + 100;
9433 reslen += rescnt;
9434 if (reslen < 0) {
9435 Py_XDECREF(temp);
9436 PyErr_NoMemory();
9437 goto onError;
9438 }
9439 if (_PyUnicode_Resize(&result, reslen) < 0) {
9440 Py_XDECREF(temp);
9441 goto onError;
9442 }
9443 res = PyUnicode_AS_UNICODE(result)
9444 + reslen - rescnt;
9445 }
9446 if (sign) {
9447 if (fill != ' ')
9448 *res++ = sign;
9449 rescnt--;
9450 if (width > len)
9451 width--;
9452 }
9453 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9454 assert(pbuf[0] == '0');
9455 assert(pbuf[1] == c);
9456 if (fill != ' ') {
9457 *res++ = *pbuf++;
9458 *res++ = *pbuf++;
9459 }
9460 rescnt -= 2;
9461 width -= 2;
9462 if (width < 0)
9463 width = 0;
9464 len -= 2;
9465 }
9466 if (width > len && !(flags & F_LJUST)) {
9467 do {
9468 --rescnt;
9469 *res++ = fill;
9470 } while (--width > len);
9471 }
9472 if (fill == ' ') {
9473 if (sign)
9474 *res++ = sign;
9475 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9476 assert(pbuf[0] == '0');
9477 assert(pbuf[1] == c);
9478 *res++ = *pbuf++;
9479 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009480 }
9481 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 Py_UNICODE_COPY(res, pbuf, len);
9483 res += len;
9484 rescnt -= len;
9485 while (--width >= len) {
9486 --rescnt;
9487 *res++ = ' ';
9488 }
9489 if (dict && (argidx < arglen) && c != '%') {
9490 PyErr_SetString(PyExc_TypeError,
9491 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009492 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 goto onError;
9494 }
9495 Py_XDECREF(temp);
9496 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 } /* until end */
9498 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 PyErr_SetString(PyExc_TypeError,
9500 "not all arguments converted during string formatting");
9501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
9503
Thomas Woutersa96affe2006-03-12 00:29:36 +00009504 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
9509 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 return (PyObject *)result;
9511
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 Py_XDECREF(result);
9514 Py_DECREF(uformat);
9515 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517 }
9518 return NULL;
9519}
9520
Jeremy Hylton938ace62002-07-17 16:30:39 +00009521static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009522unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9523
Tim Peters6d6c1a32001-08-02 04:15:00 +00009524static PyObject *
9525unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9526{
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009528 static char *kwlist[] = {"object", "encoding", "errors", 0};
9529 char *encoding = NULL;
9530 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009531
Benjamin Peterson14339b62009-01-31 16:36:08 +00009532 if (type != &PyUnicode_Type)
9533 return unicode_subtype_new(type, args, kwds);
9534 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009536 return NULL;
9537 if (x == NULL)
9538 return (PyObject *)_PyUnicode_New(0);
9539 if (encoding == NULL && errors == NULL)
9540 return PyObject_Str(x);
9541 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009543}
9544
Guido van Rossume023fe02001-08-30 03:12:59 +00009545static PyObject *
9546unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9547{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009548 PyUnicodeObject *tmp, *pnew;
9549 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009550
Benjamin Peterson14339b62009-01-31 16:36:08 +00009551 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9552 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9553 if (tmp == NULL)
9554 return NULL;
9555 assert(PyUnicode_Check(tmp));
9556 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9557 if (pnew == NULL) {
9558 Py_DECREF(tmp);
9559 return NULL;
9560 }
9561 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9562 if (pnew->str == NULL) {
9563 _Py_ForgetReference((PyObject *)pnew);
9564 PyObject_Del(pnew);
9565 Py_DECREF(tmp);
9566 return PyErr_NoMemory();
9567 }
9568 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9569 pnew->length = n;
9570 pnew->hash = tmp->hash;
9571 Py_DECREF(tmp);
9572 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009573}
9574
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009581
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009582static PyObject *unicode_iter(PyObject *seq);
9583
/* The type object for str.  Instances are created through unicode_new
   (tp_new), iterated through unicode_iter (tp_iter) and documented by
   unicode_doc (tp_doc). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
9627
9628/* Initialize the Unicode implementation */
9629
Thomas Wouters78890102000-07-22 19:25:51 +00009630void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009632 int i;
9633
Thomas Wouters477c8d52006-05-27 19:21:47 +00009634 /* XXX - move this array to unicodectype.c ? */
9635 Py_UNICODE linebreak[] = {
9636 0x000A, /* LINE FEED */
9637 0x000D, /* CARRIAGE RETURN */
9638 0x001C, /* FILE SEPARATOR */
9639 0x001D, /* GROUP SEPARATOR */
9640 0x001E, /* RECORD SEPARATOR */
9641 0x0085, /* NEXT LINE */
9642 0x2028, /* LINE SEPARATOR */
9643 0x2029, /* PARAGRAPH SEPARATOR */
9644 };
9645
Fred Drakee4315f52000-05-09 19:53:39 +00009646 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009647 free_list = NULL;
9648 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009650 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009651 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009652
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009653 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009655 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009657
9658 /* initialize the linebreak bloom filter */
9659 bloom_linebreak = make_bloom_mask(
9660 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9661 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009662
9663 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664}
9665
9666/* Finalize the Unicode implementation */
9667
Christian Heimesa156e092008-02-16 07:38:31 +00009668int
9669PyUnicode_ClearFreeList(void)
9670{
9671 int freelist_size = numfree;
9672 PyUnicodeObject *u;
9673
9674 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 PyUnicodeObject *v = u;
9676 u = *(PyUnicodeObject **)u;
9677 if (v->str)
9678 PyObject_DEL(v->str);
9679 Py_XDECREF(v->defenc);
9680 PyObject_Del(v);
9681 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009682 }
9683 free_list = NULL;
9684 assert(numfree == 0);
9685 return freelist_size;
9686}
9687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688void
Thomas Wouters78890102000-07-22 19:25:51 +00009689_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009691 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009693 Py_XDECREF(unicode_empty);
9694 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009695
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009696 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 if (unicode_latin1[i]) {
9698 Py_DECREF(unicode_latin1[i]);
9699 unicode_latin1[i] = NULL;
9700 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009701 }
Christian Heimesa156e092008-02-16 07:38:31 +00009702 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009704
Walter Dörwald16807132007-05-25 13:52:07 +00009705void
9706PyUnicode_InternInPlace(PyObject **p)
9707{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9709 PyObject *t;
9710 if (s == NULL || !PyUnicode_Check(s))
9711 Py_FatalError(
9712 "PyUnicode_InternInPlace: unicode strings only please!");
9713 /* If it's a subclass, we don't really know what putting
9714 it in the interned dict might do. */
9715 if (!PyUnicode_CheckExact(s))
9716 return;
9717 if (PyUnicode_CHECK_INTERNED(s))
9718 return;
9719 if (interned == NULL) {
9720 interned = PyDict_New();
9721 if (interned == NULL) {
9722 PyErr_Clear(); /* Don't leave an exception */
9723 return;
9724 }
9725 }
9726 /* It might be that the GetItem call fails even
9727 though the key is present in the dictionary,
9728 namely when this happens during a stack overflow. */
9729 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009731 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009732
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 if (t) {
9734 Py_INCREF(t);
9735 Py_DECREF(*p);
9736 *p = t;
9737 return;
9738 }
Walter Dörwald16807132007-05-25 13:52:07 +00009739
Benjamin Peterson14339b62009-01-31 16:36:08 +00009740 PyThreadState_GET()->recursion_critical = 1;
9741 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9742 PyErr_Clear();
9743 PyThreadState_GET()->recursion_critical = 0;
9744 return;
9745 }
9746 PyThreadState_GET()->recursion_critical = 0;
9747 /* The two references in interned are not counted by refcnt.
9748 The deallocator will take care of this */
9749 Py_REFCNT(s) -= 2;
9750 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009751}
9752
9753void
9754PyUnicode_InternImmortal(PyObject **p)
9755{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 PyUnicode_InternInPlace(p);
9757 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9758 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9759 Py_INCREF(*p);
9760 }
Walter Dörwald16807132007-05-25 13:52:07 +00009761}
9762
9763PyObject *
9764PyUnicode_InternFromString(const char *cp)
9765{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 PyObject *s = PyUnicode_FromString(cp);
9767 if (s == NULL)
9768 return NULL;
9769 PyUnicode_InternInPlace(&s);
9770 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009771}
9772
9773void _Py_ReleaseInternedUnicodeStrings(void)
9774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009775 PyObject *keys;
9776 PyUnicodeObject *s;
9777 Py_ssize_t i, n;
9778 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009779
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 if (interned == NULL || !PyDict_Check(interned))
9781 return;
9782 keys = PyDict_Keys(interned);
9783 if (keys == NULL || !PyList_Check(keys)) {
9784 PyErr_Clear();
9785 return;
9786 }
Walter Dörwald16807132007-05-25 13:52:07 +00009787
Benjamin Peterson14339b62009-01-31 16:36:08 +00009788 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9789 detector, interned unicode strings are not forcibly deallocated;
9790 rather, we give them their stolen references back, and then clear
9791 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009792
Benjamin Peterson14339b62009-01-31 16:36:08 +00009793 n = PyList_GET_SIZE(keys);
9794 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009796 for (i = 0; i < n; i++) {
9797 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9798 switch (s->state) {
9799 case SSTATE_NOT_INTERNED:
9800 /* XXX Shouldn't happen */
9801 break;
9802 case SSTATE_INTERNED_IMMORTAL:
9803 Py_REFCNT(s) += 1;
9804 immortal_size += s->length;
9805 break;
9806 case SSTATE_INTERNED_MORTAL:
9807 Py_REFCNT(s) += 2;
9808 mortal_size += s->length;
9809 break;
9810 default:
9811 Py_FatalError("Inconsistent interned string state.");
9812 }
9813 s->state = SSTATE_NOT_INTERNED;
9814 }
9815 fprintf(stderr, "total size of all interned strings: "
9816 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9817 "mortal/immortal\n", mortal_size, immortal_size);
9818 Py_DECREF(keys);
9819 PyDict_Clear(interned);
9820 Py_DECREF(interned);
9821 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009822}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009823
9824
9825/********************* Unicode Iterator **************************/
9826
/* Iteration state for a str object; instances are created by
   unicode_iter() and have type PyUnicodeIter_Type. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next item to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9832
9833static void
9834unicodeiter_dealloc(unicodeiterobject *it)
9835{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009836 _PyObject_GC_UNTRACK(it);
9837 Py_XDECREF(it->it_seq);
9838 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009839}
9840
9841static int
9842unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9843{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844 Py_VISIT(it->it_seq);
9845 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009846}
9847
9848static PyObject *
9849unicodeiter_next(unicodeiterobject *it)
9850{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 PyUnicodeObject *seq;
9852 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009853
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 assert(it != NULL);
9855 seq = it->it_seq;
9856 if (seq == NULL)
9857 return NULL;
9858 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009859
Benjamin Peterson14339b62009-01-31 16:36:08 +00009860 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9861 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 if (item != NULL)
9864 ++it->it_index;
9865 return item;
9866 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009867
Benjamin Peterson14339b62009-01-31 16:36:08 +00009868 Py_DECREF(seq);
9869 it->it_seq = NULL;
9870 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009871}
9872
9873static PyObject *
9874unicodeiter_len(unicodeiterobject *it)
9875{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876 Py_ssize_t len = 0;
9877 if (it->it_seq)
9878 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9879 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009880}
9881
9882PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9883
/* Method table of str iterators (tp_methods of PyUnicodeIter_Type);
   exposes unicodeiter_len() as __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9889
/* Type object for str iterators.  Instances are GC-tracked
   unicodeiterobject structures created by unicode_iter(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9922
9923static PyObject *
9924unicode_iter(PyObject *seq)
9925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009927
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 if (!PyUnicode_Check(seq)) {
9929 PyErr_BadInternalCall();
9930 return NULL;
9931 }
9932 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9933 if (it == NULL)
9934 return NULL;
9935 it->it_index = 0;
9936 Py_INCREF(seq);
9937 it->it_seq = (PyUnicodeObject *)seq;
9938 _PyObject_GC_TRACK(it);
9939 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009940}
9941
Martin v. Löwis5b222132007-06-10 09:51:05 +00009942size_t
9943Py_UNICODE_strlen(const Py_UNICODE *u)
9944{
9945 int res = 0;
9946 while(*u++)
9947 res++;
9948 return res;
9949}
9950
9951Py_UNICODE*
9952Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9953{
9954 Py_UNICODE *u = s1;
9955 while ((*u++ = *s2++));
9956 return s1;
9957}
9958
9959Py_UNICODE*
9960Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9961{
9962 Py_UNICODE *u = s1;
9963 while ((*u++ = *s2++))
9964 if (n-- == 0)
9965 break;
9966 return s1;
9967}
9968
9969int
9970Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9971{
9972 while (*s1 && *s2 && *s1 == *s2)
9973 s1++, s2++;
9974 if (*s1 && *s2)
9975 return (*s1 < *s2) ? -1 : +1;
9976 if (*s1)
9977 return 1;
9978 if (*s2)
9979 return -1;
9980 return 0;
9981}
9982
9983Py_UNICODE*
9984Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9985{
9986 const Py_UNICODE *p;
9987 for (p = s; *p; p++)
9988 if (*p == c)
9989 return (Py_UNICODE*)p;
9990 return NULL;
9991}
9992
9993
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009994#ifdef __cplusplus
9995}
9996#endif
9997
9998
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009999/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 Local variables:
10001 c-basic-offset: 4
10002 indent-tabs-mode: nil
10003 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010004*/