blob: 9c0be9b23ca7569fd6ec3e965b9d2504a7c3e424 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "keep-alive" optimization.

   Objects placed on the free list whose length is below this limit
   keep their character buffer allocated.  This saves malloc() calls
   when small Unicode objects are created again.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory behind.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is selected unless
   WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 if the 7-bit character i counts as
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 HORIZONTAL TABULATION
     *   0x000A LINE FEED
     *   0x000B VERTICAL TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Lookup table for fast detection of line breaks: entry i is 1 if the
   7-bit character i is treated as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple a single bitmask is used; the least significant 5 bits of a
   character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.

   The constant is shifted as an unsigned long: the index can be as
   large as 31, and shifting a signed int 1 by 31 bits is undefined
   behaviour.  mask is parenthesized so that an expression argument
   (e.g. "a | b") is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break.  Characters below 128 are looked up
   in ascii_linebreak; for all others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated if the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* Non-zero if chr is a member of set: the bloom mask is tested first
   and unicode_member() is only called if the mask matches.  The whole
   expansion is parenthesized so the macro can be combined safely with
   other operators (e.g. !BLOOM_MEMBER(...)). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000670makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
671 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000672{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000673 *fmt++ = '%';
674 if (width) {
675 if (zeropad)
676 *fmt++ = '0';
677 fmt += sprintf(fmt, "%d", width);
678 }
679 if (precision)
680 fmt += sprintf(fmt, ".%d", precision);
681 if (longflag)
682 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000683 else if (longlongflag) {
684 /* longlongflag should only ever be nonzero on machines with
685 HAVE_LONG_LONG defined */
686#ifdef HAVE_LONG_LONG
687 char *f = PY_FORMAT_LONG_LONG;
688 while (*f)
689 *fmt++ = *f++;
690#else
691 /* we shouldn't ever get here */
692 assert(0);
693 *fmt++ = 'l';
694#endif
695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 else if (size_tflag) {
697 char *f = PY_FORMAT_SIZE_T;
698 while (*f)
699 *fmt++ = *f++;
700 }
701 *fmt++ = c;
702 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000703}
704
/* Copy the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (no terminator is written).
   Uses the variables `copy` and `s` of the surrounding scope. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}

/* size of the fixed-size buffer used to format single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for the output of %ld.
   21 characters allows for 64-bit integers (in decimal) and an
   optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
716
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717PyObject *
718PyUnicode_FromFormatV(const char *format, va_list vargs)
719{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000720 va_list count;
721 Py_ssize_t callcount = 0;
722 PyObject **callresults = NULL;
723 PyObject **callresult = NULL;
724 Py_ssize_t n = 0;
725 int width = 0;
726 int precision = 0;
727 int zeropad;
728 const char* f;
729 Py_UNICODE *s;
730 PyObject *string;
731 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000732 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000733 /* use abuffer instead of buffer, if we need more space
734 * (which can happen if there's a format specifier with width). */
735 char *abuffer = NULL;
736 char *realbuffer;
737 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000738 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740
741#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000742 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000743#else
744#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000745 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748#endif
749#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000767 }
768 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000770 if (callcount) {
771 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
772 if (!callresults) {
773 PyErr_NoMemory();
774 return NULL;
775 }
776 callresult = callresults;
777 }
778 /* step 3: figure out how large a buffer we need */
779 for (f = format; *f; f++) {
780 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000781#ifdef HAVE_LONG_LONG
782 int longlongflag = 0;
783#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000784 const char* p = f;
785 width = 0;
786 while (ISDIGIT((unsigned)*f))
787 width = (width*10) + *f++ - '0';
788 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
789 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000790
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
792 * they don't affect the amount of space we reserve.
793 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794 if (*f == 'l') {
795 if (f[1] == 'd' || f[1] == 'u') {
796 ++f;
797 }
798#ifdef HAVE_LONG_LONG
799 else if (f[1] == 'l' &&
800 (f[2] == 'd' || f[2] == 'u')) {
801 longlongflag = 1;
802 f += 2;
803 }
804#endif
805 }
806 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000807 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000808 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 switch (*f) {
811 case 'c':
812 (void)va_arg(count, int);
813 /* fall through... */
814 case '%':
815 n++;
816 break;
817 case 'd': case 'u': case 'i': case 'x':
818 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819#ifdef HAVE_LONG_LONG
820 if (longlongflag) {
821 if (width < MAX_LONG_LONG_CHARS)
822 width = MAX_LONG_LONG_CHARS;
823 }
824 else
825#endif
826 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
827 including sign. Decimal takes the most space. This
828 isn't enough for octal. If a width is specified we
829 need more (which we allocate later). */
830 if (width < MAX_LONG_CHARS)
831 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000832 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000833 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000834 if (abuffersize < width)
835 abuffersize = width;
836 break;
837 case 's':
838 {
839 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000840 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000841 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
842 if (!str)
843 goto fail;
844 n += PyUnicode_GET_SIZE(str);
845 /* Remember the str and switch to the next slot */
846 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 break;
848 }
849 case 'U':
850 {
851 PyObject *obj = va_arg(count, PyObject *);
852 assert(obj && PyUnicode_Check(obj));
853 n += PyUnicode_GET_SIZE(obj);
854 break;
855 }
856 case 'V':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 const char *str = va_arg(count, const char *);
860 assert(obj || str);
861 assert(!obj || PyUnicode_Check(obj));
862 if (obj)
863 n += PyUnicode_GET_SIZE(obj);
864 else
865 n += strlen(str);
866 break;
867 }
868 case 'S':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 PyObject *str;
872 assert(obj);
873 str = PyObject_Str(obj);
874 if (!str)
875 goto fail;
876 n += PyUnicode_GET_SIZE(str);
877 /* Remember the str and switch to the next slot */
878 *callresult++ = str;
879 break;
880 }
881 case 'R':
882 {
883 PyObject *obj = va_arg(count, PyObject *);
884 PyObject *repr;
885 assert(obj);
886 repr = PyObject_Repr(obj);
887 if (!repr)
888 goto fail;
889 n += PyUnicode_GET_SIZE(repr);
890 /* Remember the repr and switch to the next slot */
891 *callresult++ = repr;
892 break;
893 }
894 case 'A':
895 {
896 PyObject *obj = va_arg(count, PyObject *);
897 PyObject *ascii;
898 assert(obj);
899 ascii = PyObject_ASCII(obj);
900 if (!ascii)
901 goto fail;
902 n += PyUnicode_GET_SIZE(ascii);
903 /* Remember the repr and switch to the next slot */
904 *callresult++ = ascii;
905 break;
906 }
907 case 'p':
908 (void) va_arg(count, int);
909 /* maximum 64-bit pointer representation:
910 * 0xffffffffffffffff
911 * so 19 characters is enough.
912 * XXX I count 18 -- what's the extra for?
913 */
914 n += 19;
915 break;
916 default:
917 /* if we stumble upon an unknown
918 formatting code, copy the rest of
919 the format string to the output
920 string. (we cannot just skip the
921 code, since there's no way to know
922 what's in the argument list) */
923 n += strlen(p);
924 goto expand;
925 }
926 } else
927 n++;
928 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000929 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000930 if (abuffersize > ITEM_BUFFER_LEN) {
931 /* add 1 for sprintf's trailing null byte */
932 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 if (!abuffer) {
934 PyErr_NoMemory();
935 goto fail;
936 }
937 realbuffer = abuffer;
938 }
939 else
940 realbuffer = buffer;
941 /* step 4: fill the buffer */
942 /* Since we've analyzed how much space we need for the worst case,
943 we don't have to resize the string.
944 There can be no errors beyond this point. */
945 string = PyUnicode_FromUnicode(NULL, n);
946 if (!string)
947 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000948
Benjamin Peterson14339b62009-01-31 16:36:08 +0000949 s = PyUnicode_AS_UNICODE(string);
950 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000951
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 for (f = format; *f; f++) {
953 if (*f == '%') {
954 const char* p = f++;
955 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000956 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000957 int size_tflag = 0;
958 zeropad = (*f == '0');
959 /* parse the width.precision part */
960 width = 0;
961 while (ISDIGIT((unsigned)*f))
962 width = (width*10) + *f++ - '0';
963 precision = 0;
964 if (*f == '.') {
965 f++;
966 while (ISDIGIT((unsigned)*f))
967 precision = (precision*10) + *f++ - '0';
968 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000969 /* Handle %ld, %lu, %lld and %llu. */
970 if (*f == 'l') {
971 if (f[1] == 'd' || f[1] == 'u') {
972 longflag = 1;
973 ++f;
974 }
975#ifdef HAVE_LONG_LONG
976 else if (f[1] == 'l' &&
977 (f[2] == 'd' || f[2] == 'u')) {
978 longlongflag = 1;
979 f += 2;
980 }
981#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 }
983 /* handle the size_t flag. */
984 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
985 size_tflag = 1;
986 ++f;
987 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000988
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 switch (*f) {
990 case 'c':
991 *s++ = va_arg(vargs, int);
992 break;
993 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000994 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
995 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 if (longflag)
997 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000998#ifdef HAVE_LONG_LONG
999 else if (longlongflag)
1000 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1001#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 else if (size_tflag)
1003 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1004 else
1005 sprintf(realbuffer, fmt, va_arg(vargs, int));
1006 appendstring(realbuffer);
1007 break;
1008 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001009 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1010 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 if (longflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013#ifdef HAVE_LONG_LONG
1014 else if (longlongflag)
1015 sprintf(realbuffer, fmt, va_arg(vargs,
1016 unsigned PY_LONG_LONG));
1017#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 else if (size_tflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1020 else
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1022 appendstring(realbuffer);
1023 break;
1024 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001025 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 sprintf(realbuffer, fmt, va_arg(vargs, int));
1027 appendstring(realbuffer);
1028 break;
1029 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001030 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001031 sprintf(realbuffer, fmt, va_arg(vargs, int));
1032 appendstring(realbuffer);
1033 break;
1034 case 's':
1035 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001036 /* unused, since we already have the result */
1037 (void) va_arg(vargs, char *);
1038 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1039 PyUnicode_GET_SIZE(*callresult));
1040 s += PyUnicode_GET_SIZE(*callresult);
1041 /* We're done with the unicode()/repr() => forget it */
1042 Py_DECREF(*callresult);
1043 /* switch to next unicode()/repr() result */
1044 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
1046 }
1047 case 'U':
1048 {
1049 PyObject *obj = va_arg(vargs, PyObject *);
1050 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1051 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1052 s += size;
1053 break;
1054 }
1055 case 'V':
1056 {
1057 PyObject *obj = va_arg(vargs, PyObject *);
1058 const char *str = va_arg(vargs, const char *);
1059 if (obj) {
1060 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1061 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1062 s += size;
1063 } else {
1064 appendstring(str);
1065 }
1066 break;
1067 }
1068 case 'S':
1069 case 'R':
1070 {
1071 Py_UNICODE *ucopy;
1072 Py_ssize_t usize;
1073 Py_ssize_t upos;
1074 /* unused, since we already have the result */
1075 (void) va_arg(vargs, PyObject *);
1076 ucopy = PyUnicode_AS_UNICODE(*callresult);
1077 usize = PyUnicode_GET_SIZE(*callresult);
1078 for (upos = 0; upos<usize;)
1079 *s++ = ucopy[upos++];
1080 /* We're done with the unicode()/repr() => forget it */
1081 Py_DECREF(*callresult);
1082 /* switch to next unicode()/repr() result */
1083 ++callresult;
1084 break;
1085 }
1086 case 'p':
1087 sprintf(buffer, "%p", va_arg(vargs, void*));
1088 /* %p is ill-defined: ensure leading 0x. */
1089 if (buffer[1] == 'X')
1090 buffer[1] = 'x';
1091 else if (buffer[1] != 'x') {
1092 memmove(buffer+2, buffer, strlen(buffer)+1);
1093 buffer[0] = '0';
1094 buffer[1] = 'x';
1095 }
1096 appendstring(buffer);
1097 break;
1098 case '%':
1099 *s++ = '%';
1100 break;
1101 default:
1102 appendstring(p);
1103 goto end;
1104 }
1105 } else
1106 *s++ = *f;
1107 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001108
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 if (callresults)
1111 PyObject_Free(callresults);
1112 if (abuffer)
1113 PyObject_Free(abuffer);
1114 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1115 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001116 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001117 if (callresults) {
1118 PyObject **callresult2 = callresults;
1119 while (callresult2 < callresult) {
1120 Py_DECREF(*callresult2);
1121 ++callresult2;
1122 }
1123 PyObject_Free(callresults);
1124 }
1125 if (abuffer)
1126 PyObject_Free(abuffer);
1127 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001128}
1129
1130#undef appendstring
1131
1132PyObject *
1133PyUnicode_FromFormat(const char *format, ...)
1134{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001135 PyObject* ret;
1136 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137
1138#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001139 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001140#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001141 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001142#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 ret = PyUnicode_FromFormatV(format, vargs);
1144 va_end(vargs);
1145 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146}
1147
Martin v. Löwis18e16552006-02-15 17:27:45 +00001148Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 wchar_t *w,
1150 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151{
1152 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 PyErr_BadInternalCall();
1154 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001156
1157 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001159 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161#ifdef HAVE_USABLE_WCHAR_T
1162 memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 register Py_UNICODE *u;
1166 register Py_ssize_t i;
1167 u = PyUnicode_AS_UNICODE(unicode);
1168 for (i = size; i > 0; i--)
1169 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
1171#endif
1172
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001173 if (size > PyUnicode_GET_SIZE(unicode))
1174 return PyUnicode_GET_SIZE(unicode);
1175 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001176 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177}
1178
1179#endif
1180
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001181PyObject *PyUnicode_FromOrdinal(int ordinal)
1182{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001183 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001184
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001185 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 PyErr_SetString(PyExc_ValueError,
1187 "chr() arg not in range(0x110000)");
1188 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001189 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001190
1191#ifndef Py_UNICODE_WIDE
1192 if (ordinal > 0xffff) {
1193 ordinal -= 0x10000;
1194 s[0] = 0xD800 | (ordinal >> 10);
1195 s[1] = 0xDC00 | (ordinal & 0x3FF);
1196 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001197 }
1198#endif
1199
Hye-Shik Chang40574832004-04-06 07:24:51 +00001200 s[0] = (Py_UNICODE)ordinal;
1201 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001202}
1203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204PyObject *PyUnicode_FromObject(register PyObject *obj)
1205{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001206 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001207 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001208 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001209 Py_INCREF(obj);
1210 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001211 }
1212 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001213 /* For a Unicode subtype that's not a Unicode object,
1214 return a true Unicode object with the same data. */
1215 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1216 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001218 PyErr_Format(PyExc_TypeError,
1219 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001220 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001221 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001222}
1223
1224PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001225 const char *encoding,
1226 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001227{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001228 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001231
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001233 PyErr_BadInternalCall();
1234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001236
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001237 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_SetString(PyExc_TypeError,
1239 "decoding str is not supported");
1240 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001241 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001242
1243 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001244 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001245 s = PyBytes_AS_STRING(obj);
1246 len = PyBytes_GET_SIZE(obj);
1247 }
1248 else if (PyByteArray_Check(obj)) {
1249 s = PyByteArray_AS_STRING(obj);
1250 len = PyByteArray_GET_SIZE(obj);
1251 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001252 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 /* Overwrite the error message with something more useful in
1254 case of a TypeError. */
1255 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001256 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 "coercing to str: need string or buffer, "
1258 "%.80s found",
1259 Py_TYPE(obj)->tp_name);
1260 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001261 }
Tim Petersced69f82003-09-16 20:30:58 +00001262
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001263 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001265 Py_INCREF(unicode_empty);
1266 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 }
Tim Petersced69f82003-09-16 20:30:58 +00001268 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001269 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001270
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001271 return v;
1272
Benjamin Peterson29060642009-01-31 22:14:21 +00001273 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
1277PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 Py_ssize_t size,
1279 const char *encoding,
1280 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281{
1282 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001283 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001284 char lower[20]; /* Enough for any encoding name we recognize */
1285 char *l;
1286 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287
1288 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001289 encoding = PyUnicode_GetDefaultEncoding();
1290
1291 /* Convert encoding to lower case and replace '_' with '-' in order to
1292 catch e.g. UTF_8 */
1293 e = encoding;
1294 l = lower;
1295 while (*e && l < &lower[(sizeof lower) - 2]) {
1296 if (ISUPPER(*e)) {
1297 *l++ = TOLOWER(*e++);
1298 }
1299 else if (*e == '_') {
1300 *l++ = '-';
1301 e++;
1302 }
1303 else {
1304 *l++ = *e++;
1305 }
1306 }
1307 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001308
1309 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001310 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001312 else if ((strcmp(lower, "latin-1") == 0) ||
1313 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001314 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001315#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001316 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001317 return PyUnicode_DecodeMBCS(s, size, errors);
1318#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001319 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001320 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001321 else if (strcmp(lower, "utf-16") == 0)
1322 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1323 else if (strcmp(lower, "utf-32") == 0)
1324 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325
1326 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001327 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001328 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001329 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001330 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (buffer == NULL)
1332 goto onError;
1333 unicode = PyCodec_Decode(buffer, encoding, errors);
1334 if (unicode == NULL)
1335 goto onError;
1336 if (!PyUnicode_Check(unicode)) {
1337 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001338 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001339 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 Py_DECREF(unicode);
1341 goto onError;
1342 }
1343 Py_DECREF(buffer);
1344 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Benjamin Peterson29060642009-01-31 22:14:21 +00001346 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347 Py_XDECREF(buffer);
1348 return NULL;
1349}
1350
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001351PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1352 const char *encoding,
1353 const char *errors)
1354{
1355 PyObject *v;
1356
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361
1362 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001363 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001364
1365 /* Decode via the codec registry */
1366 v = PyCodec_Decode(unicode, encoding, errors);
1367 if (v == NULL)
1368 goto onError;
1369 return v;
1370
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001372 return NULL;
1373}
1374
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001375PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1376 const char *encoding,
1377 const char *errors)
1378{
1379 PyObject *v;
1380
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385
1386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001388
1389 /* Decode via the codec registry */
1390 v = PyCodec_Decode(unicode, encoding, errors);
1391 if (v == NULL)
1392 goto onError;
1393 if (!PyUnicode_Check(v)) {
1394 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001395 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 Py_TYPE(v)->tp_name);
1397 Py_DECREF(v);
1398 goto onError;
1399 }
1400 return v;
1401
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001403 return NULL;
1404}
1405
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001407 Py_ssize_t size,
1408 const char *encoding,
1409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
1411 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001412
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413 unicode = PyUnicode_FromUnicode(s, size);
1414 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1417 Py_DECREF(unicode);
1418 return v;
1419}
1420
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001421PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1422 const char *encoding,
1423 const char *errors)
1424{
1425 PyObject *v;
1426
1427 if (!PyUnicode_Check(unicode)) {
1428 PyErr_BadArgument();
1429 goto onError;
1430 }
1431
1432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001434
1435 /* Encode via the codec registry */
1436 v = PyCodec_Encode(unicode, encoding, errors);
1437 if (v == NULL)
1438 goto onError;
1439 return v;
1440
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001442 return NULL;
1443}
1444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1446 const char *encoding,
1447 const char *errors)
1448{
1449 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001450
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 if (!PyUnicode_Check(unicode)) {
1452 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454 }
Fred Drakee4315f52000-05-09 19:53:39 +00001455
Tim Petersced69f82003-09-16 20:30:58 +00001456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001458
1459 /* Shortcuts for common default encodings */
1460 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001461 if (strcmp(encoding, "utf-8") == 0)
1462 return PyUnicode_AsUTF8String(unicode);
1463 else if (strcmp(encoding, "latin-1") == 0)
1464 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001465#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001466 else if (strcmp(encoding, "mbcs") == 0)
1467 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001468#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001469 else if (strcmp(encoding, "ascii") == 0)
1470 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001471 /* During bootstrap, we may need to find the encodings
1472 package, to load the file system encoding, and require the
1473 file system encoding in order to load the encodings
1474 package.
1475
1476 Break out of this dependency by assuming that the path to
1477 the encodings module is ASCII-only. XXX could try wcstombs
1478 instead, if the file system encoding is the locale's
1479 encoding. */
1480 else if (Py_FileSystemDefaultEncoding &&
1481 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1482 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485
1486 /* Encode via the codec registry */
1487 v = PyCodec_Encode(unicode, encoding, errors);
1488 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001489 return NULL;
1490
1491 /* The normal path */
1492 if (PyBytes_Check(v))
1493 return v;
1494
1495 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001496 if (PyByteArray_Check(v)) {
1497 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001498 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001499 PyOS_snprintf(msg, sizeof(msg),
1500 "encoder %s returned buffer instead of bytes",
1501 encoding);
1502 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001503 Py_DECREF(v);
1504 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001505 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001506
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001507 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1508 Py_DECREF(v);
1509 return b;
1510 }
1511
1512 PyErr_Format(PyExc_TypeError,
1513 "encoder did not return a bytes object (type=%.400s)",
1514 Py_TYPE(v)->tp_name);
1515 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516 return NULL;
1517}
1518
1519PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1520 const char *encoding,
1521 const char *errors)
1522{
1523 PyObject *v;
1524
1525 if (!PyUnicode_Check(unicode)) {
1526 PyErr_BadArgument();
1527 goto onError;
1528 }
1529
1530 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001532
1533 /* Encode via the codec registry */
1534 v = PyCodec_Encode(unicode, encoding, errors);
1535 if (v == NULL)
1536 goto onError;
1537 if (!PyUnicode_Check(v)) {
1538 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001539 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001540 Py_TYPE(v)->tp_name);
1541 Py_DECREF(v);
1542 goto onError;
1543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001545
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 return NULL;
1548}
1549
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001550PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001551 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001552{
1553 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001554 if (v)
1555 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001556 if (errors != NULL)
1557 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001558 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001559 PyUnicode_GET_SIZE(unicode),
1560 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001561 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001562 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001563 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001564 return v;
1565}
1566
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001567PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001568PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001569 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001570 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1571}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001572
Christian Heimes5894ba72007-11-04 11:43:14 +00001573PyObject*
1574PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1575{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001576 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1577 can be undefined. If it is case, decode using UTF-8. The following assumes
1578 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1579 bootstrapping process where the codecs aren't ready yet.
1580 */
1581 if (Py_FileSystemDefaultEncoding) {
1582#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001583 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001584 return PyUnicode_DecodeMBCS(s, size, "replace");
1585 }
1586#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001587 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001588 return PyUnicode_DecodeUTF8(s, size, "replace");
1589 }
1590#endif
1591 return PyUnicode_Decode(s, size,
1592 Py_FileSystemDefaultEncoding,
1593 "replace");
1594 }
1595 else {
1596 return PyUnicode_DecodeUTF8(s, size, "replace");
1597 }
1598}
1599
Martin v. Löwis011e8422009-05-05 04:43:17 +00001600/* Convert the argument to a bytes object, according to the file
1601 system encoding */
1602
1603int
1604PyUnicode_FSConverter(PyObject* arg, void* addr)
1605{
1606 PyObject *output = NULL;
1607 Py_ssize_t size;
1608 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001609 if (arg == NULL) {
1610 Py_DECREF(*(PyObject**)addr);
1611 return 1;
1612 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001613 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1614 output = arg;
1615 Py_INCREF(output);
1616 }
1617 else {
1618 arg = PyUnicode_FromObject(arg);
1619 if (!arg)
1620 return 0;
1621 output = PyUnicode_AsEncodedObject(arg,
1622 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001623 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001624 Py_DECREF(arg);
1625 if (!output)
1626 return 0;
1627 if (!PyBytes_Check(output)) {
1628 Py_DECREF(output);
1629 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1630 return 0;
1631 }
1632 }
1633 if (PyBytes_Check(output)) {
1634 size = PyBytes_GET_SIZE(output);
1635 data = PyBytes_AS_STRING(output);
1636 }
1637 else {
1638 size = PyByteArray_GET_SIZE(output);
1639 data = PyByteArray_AS_STRING(output);
1640 }
1641 if (size != strlen(data)) {
1642 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1643 Py_DECREF(output);
1644 return 0;
1645 }
1646 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001647 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001648}
1649
1650
Martin v. Löwis5b222132007-06-10 09:51:05 +00001651char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001652_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001653{
Christian Heimesf3863112007-11-22 07:46:41 +00001654 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001655 if (!PyUnicode_Check(unicode)) {
1656 PyErr_BadArgument();
1657 return NULL;
1658 }
Christian Heimesf3863112007-11-22 07:46:41 +00001659 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1660 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001661 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001662 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001663 *psize = PyBytes_GET_SIZE(bytes);
1664 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001665}
1666
1667char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001668_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001669{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001670 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001671}
1672
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1674{
1675 if (!PyUnicode_Check(unicode)) {
1676 PyErr_BadArgument();
1677 goto onError;
1678 }
1679 return PyUnicode_AS_UNICODE(unicode);
1680
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 return NULL;
1683}
1684
Martin v. Löwis18e16552006-02-15 17:27:45 +00001685Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686{
1687 if (!PyUnicode_Check(unicode)) {
1688 PyErr_BadArgument();
1689 goto onError;
1690 }
1691 return PyUnicode_GET_SIZE(unicode);
1692
Benjamin Peterson29060642009-01-31 22:14:21 +00001693 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return -1;
1695}
1696
Thomas Wouters78890102000-07-22 19:25:51 +00001697const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001698{
1699 return unicode_default_encoding;
1700}
1701
1702int PyUnicode_SetDefaultEncoding(const char *encoding)
1703{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001704 if (strcmp(encoding, unicode_default_encoding) != 0) {
1705 PyErr_Format(PyExc_ValueError,
1706 "Can only set default encoding to %s",
1707 unicode_default_encoding);
1708 return -1;
1709 }
Fred Drakee4315f52000-05-09 19:53:39 +00001710 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001711}
1712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713/* error handling callback helper:
1714 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001715 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 and adjust various state variables.
1717 return 0 on success, -1 on error
1718*/
1719
1720static
1721int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 const char *encoding, const char *reason,
1723 const char **input, const char **inend, Py_ssize_t *startinpos,
1724 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1725 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001726{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001727 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728
1729 PyObject *restuple = NULL;
1730 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001731 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001732 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001733 Py_ssize_t requiredsize;
1734 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001736 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001737 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 int res = -1;
1739
1740 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001741 *errorHandler = PyCodec_LookupError(errors);
1742 if (*errorHandler == NULL)
1743 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 }
1745
1746 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001747 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001748 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1749 if (*exceptionObject == NULL)
1750 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 }
1752 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001753 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1754 goto onError;
1755 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1756 goto onError;
1757 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1758 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 }
1760
1761 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1762 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001763 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001765 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001766 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 }
1768 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001770
1771 /* Copy back the bytes variables, which might have been modified by the
1772 callback */
1773 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1774 if (!inputobj)
1775 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001776 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001778 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001779 *input = PyBytes_AS_STRING(inputobj);
1780 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001781 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001782 /* we can DECREF safely, as the exception has another reference,
1783 so the object won't go away. */
1784 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001788 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1790 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001791 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792
1793 /* need more space? (at least enough for what we
1794 have+the replacement+the rest of the string (starting
1795 at the new input position), so we won't have to check space
1796 when there are no errors in the rest of the string) */
1797 repptr = PyUnicode_AS_UNICODE(repunicode);
1798 repsize = PyUnicode_GET_SIZE(repunicode);
1799 requiredsize = *outpos + repsize + insize-newpos;
1800 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 if (requiredsize<2*outsize)
1802 requiredsize = 2*outsize;
1803 if (_PyUnicode_Resize(output, requiredsize) < 0)
1804 goto onError;
1805 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 }
1807 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001808 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 Py_UNICODE_COPY(*outptr, repptr, repsize);
1810 *outptr += repsize;
1811 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 /* we made it! */
1814 res = 0;
1815
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 Py_XDECREF(restuple);
1818 return res;
1819}
1820
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that arbitrary expressions can
 * be passed for them.  The range test comes first, so the table is never
 * indexed with a value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001901
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001902PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001903 Py_ssize_t size,
1904 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001905{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001906 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1907}
1908
Antoine Pitrou244651a2009-05-04 18:56:13 +00001909/* The decoder. The only state we preserve is our read position,
1910 * i.e. how many characters we have consumed. So if we end in the
1911 * middle of a shift sequence we have to back off the read position
1912 * and the output to the beginning of the sequence, otherwise we lose
1913 * all the shift state (seen bits, number of bits seen, high
1914 * surrogate). */
1915
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001916PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 Py_ssize_t size,
1918 const char *errors,
1919 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001920{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001922 Py_ssize_t startinpos;
1923 Py_ssize_t endinpos;
1924 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001925 const char *e;
1926 PyUnicodeObject *unicode;
1927 Py_UNICODE *p;
1928 const char *errmsg = "";
1929 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001930 Py_UNICODE *shiftOutStart;
1931 unsigned int base64bits = 0;
1932 unsigned long base64buffer = 0;
1933 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 PyObject *errorHandler = NULL;
1935 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936
1937 unicode = _PyUnicode_New(size);
1938 if (!unicode)
1939 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001940 if (size == 0) {
1941 if (consumed)
1942 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001943 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001944 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945
1946 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001947 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001948 e = s + size;
1949
1950 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001951 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001952 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001953 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001954
Antoine Pitrou244651a2009-05-04 18:56:13 +00001955 if (inShift) { /* in a base-64 section */
1956 if (IS_BASE64(ch)) { /* consume a base-64 character */
1957 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1958 base64bits += 6;
1959 s++;
1960 if (base64bits >= 16) {
1961 /* we have enough bits for a UTF-16 value */
1962 Py_UNICODE outCh = (Py_UNICODE)
1963 (base64buffer >> (base64bits-16));
1964 base64bits -= 16;
1965 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1966 if (surrogate) {
1967 /* expecting a second surrogate */
1968 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1969#ifdef Py_UNICODE_WIDE
1970 *p++ = (((surrogate & 0x3FF)<<10)
1971 | (outCh & 0x3FF)) + 0x10000;
1972#else
1973 *p++ = surrogate;
1974 *p++ = outCh;
1975#endif
1976 surrogate = 0;
1977 }
1978 else {
1979 surrogate = 0;
1980 errmsg = "second surrogate missing";
1981 goto utf7Error;
1982 }
1983 }
1984 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1985 /* first surrogate */
1986 surrogate = outCh;
1987 }
1988 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1989 errmsg = "unexpected second surrogate";
1990 goto utf7Error;
1991 }
1992 else {
1993 *p++ = outCh;
1994 }
1995 }
1996 }
1997 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001998 inShift = 0;
1999 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002000 if (surrogate) {
2001 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002002 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002003 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002004 if (base64bits > 0) { /* left-over bits */
2005 if (base64bits >= 6) {
2006 /* We've seen at least one base-64 character */
2007 errmsg = "partial character in shift sequence";
2008 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002009 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002010 else {
2011 /* Some bits remain; they should be zero */
2012 if (base64buffer != 0) {
2013 errmsg = "non-zero padding bits in shift sequence";
2014 goto utf7Error;
2015 }
2016 }
2017 }
2018 if (ch != '-') {
2019 /* '-' is absorbed; other terminating
2020 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 *p++ = ch;
2022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023 }
2024 }
2025 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002027 s++; /* consume '+' */
2028 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 s++;
2030 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002031 }
2032 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002033 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002034 shiftOutStart = p;
2035 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002036 }
2037 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002038 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002039 *p++ = ch;
2040 s++;
2041 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002042 else {
2043 startinpos = s-starts;
2044 s++;
2045 errmsg = "unexpected special character";
2046 goto utf7Error;
2047 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002049utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 outpos = p-PyUnicode_AS_UNICODE(unicode);
2051 endinpos = s-starts;
2052 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 errors, &errorHandler,
2054 "utf7", errmsg,
2055 &starts, &e, &startinpos, &endinpos, &exc, &s,
2056 &unicode, &outpos, &p))
2057 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 }
2059
Antoine Pitrou244651a2009-05-04 18:56:13 +00002060 /* end of string */
2061
2062 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2063 /* if we're in an inconsistent state, that's an error */
2064 if (surrogate ||
2065 (base64bits >= 6) ||
2066 (base64bits > 0 && base64buffer != 0)) {
2067 outpos = p-PyUnicode_AS_UNICODE(unicode);
2068 endinpos = size;
2069 if (unicode_decode_call_errorhandler(
2070 errors, &errorHandler,
2071 "utf7", "unterminated shift sequence",
2072 &starts, &e, &startinpos, &endinpos, &exc, &s,
2073 &unicode, &outpos, &p))
2074 goto onError;
2075 if (s < e)
2076 goto restart;
2077 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002079
2080 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002081 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 if (inShift) {
2083 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002084 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 }
2086 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002087 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002088 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002089 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002091 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002092 goto onError;
2093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 Py_XDECREF(errorHandler);
2095 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002096 return (PyObject *)unicode;
2097
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 Py_XDECREF(errorHandler);
2100 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002101 Py_DECREF(unicode);
2102 return NULL;
2103}
2104
2105
2106PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002107 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002108 int base64SetO,
2109 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002111{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002112 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002113 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002114 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002116 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002117 unsigned int base64bits = 0;
2118 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002119 char * out;
2120 char * start;
2121
2122 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002123 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002125 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002126 return PyErr_NoMemory();
2127
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 if (v == NULL)
2130 return NULL;
2131
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002132 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 for (;i < size; ++i) {
2134 Py_UNICODE ch = s[i];
2135
Antoine Pitrou244651a2009-05-04 18:56:13 +00002136 if (inShift) {
2137 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2138 /* shifting out */
2139 if (base64bits) { /* output remaining bits */
2140 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2141 base64buffer = 0;
2142 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 }
2144 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002145 /* Characters not in the BASE64 set implicitly unshift the sequence
2146 so no '-' is required, except if the character is itself a '-' */
2147 if (IS_BASE64(ch) || ch == '-') {
2148 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002150 *out++ = (char) ch;
2151 }
2152 else {
2153 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002154 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156 else { /* not in a shift sequence */
2157 if (ch == '+') {
2158 *out++ = '+';
2159 *out++ = '-';
2160 }
2161 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2162 *out++ = (char) ch;
2163 }
2164 else {
2165 *out++ = '+';
2166 inShift = 1;
2167 goto encode_char;
2168 }
2169 }
2170 continue;
2171encode_char:
2172#ifdef Py_UNICODE_WIDE
2173 if (ch >= 0x10000) {
2174 /* code first surrogate */
2175 base64bits += 16;
2176 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2177 while (base64bits >= 6) {
2178 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2179 base64bits -= 6;
2180 }
2181 /* prepare second surrogate */
2182 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2183 }
2184#endif
2185 base64bits += 16;
2186 base64buffer = (base64buffer << 16) | ch;
2187 while (base64bits >= 6) {
2188 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2189 base64bits -= 6;
2190 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002191 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002192 if (base64bits)
2193 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2194 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002195 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002196 if (_PyBytes_Resize(&v, out - start) < 0)
2197 return NULL;
2198 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002199}
2200
/* The UTF-7 helper macros defined above are removed again here, so they
   are not visible to the code that follows. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00002201#undef IS_BASE64
2202#undef FROM_BASE64
2203#undef TO_BASE64
2204#undef DECODE_DIRECT
2205#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207/* --- UTF-8 Codec -------------------------------------------------------- */
2208
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.

   Rows cover 16 byte values each: 0x00-0x7F are single bytes, 0x80-0xBF
   (continuation bytes) are not valid prefixes, 0xC0-0xDF start 2-byte,
   0xE0-0xEF 3-byte, 0xF0-0xF7 4-byte, 0xF8-0xFB 5-byte and 0xFC-0xFD
   6-byte sequences; 0xFE and 0xFF are illegal.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2230
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002232 Py_ssize_t size,
2233 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234{
Walter Dörwald69652032004-09-07 20:24:22 +00002235 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2236}
2237
Antoine Pitrouab868312009-01-10 15:40:25 +00002238/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2239#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2240
2241/* Mask to quickly check whether a C 'long' contains a
2242 non-ASCII, UTF8-encoded char. */
2243#if (SIZEOF_LONG == 8)
2244# define ASCII_CHAR_MASK 0x8080808080808080L
2245#elif (SIZEOF_LONG == 4)
2246# define ASCII_CHAR_MASK 0x80808080L
2247#else
2248# error C 'long' size should be either 4 or 8!
2249#endif
2250
Walter Dörwald69652032004-09-07 20:24:22 +00002251PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 Py_ssize_t size,
2253 const char *errors,
2254 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002255{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002258 Py_ssize_t startinpos;
2259 Py_ssize_t endinpos;
2260 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002261 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 PyUnicodeObject *unicode;
2263 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002264 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 PyObject *errorHandler = NULL;
2266 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267
2268 /* Note: size will always be longer than the resulting Unicode
2269 character count */
2270 unicode = _PyUnicode_New(size);
2271 if (!unicode)
2272 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002273 if (size == 0) {
2274 if (consumed)
2275 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278
2279 /* Unpack UTF-8 encoded data */
2280 p = unicode->str;
2281 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002282 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283
2284 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002285 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286
2287 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002288 /* Fast path for runs of ASCII characters. Given that common UTF-8
2289 input will consist of an overwhelming majority of ASCII
2290 characters, we try to optimize for this case by checking
2291 as many characters as a C 'long' can contain.
2292 First, check if we can do an aligned read, as most CPUs have
2293 a penalty for unaligned reads.
2294 */
2295 if (!((size_t) s & LONG_PTR_MASK)) {
2296 /* Help register allocation */
2297 register const char *_s = s;
2298 register Py_UNICODE *_p = p;
2299 while (_s < aligned_end) {
2300 /* Read a whole long at a time (either 4 or 8 bytes),
2301 and do a fast unrolled copy if it only contains ASCII
2302 characters. */
2303 unsigned long data = *(unsigned long *) _s;
2304 if (data & ASCII_CHAR_MASK)
2305 break;
2306 _p[0] = (unsigned char) _s[0];
2307 _p[1] = (unsigned char) _s[1];
2308 _p[2] = (unsigned char) _s[2];
2309 _p[3] = (unsigned char) _s[3];
2310#if (SIZEOF_LONG == 8)
2311 _p[4] = (unsigned char) _s[4];
2312 _p[5] = (unsigned char) _s[5];
2313 _p[6] = (unsigned char) _s[6];
2314 _p[7] = (unsigned char) _s[7];
2315#endif
2316 _s += SIZEOF_LONG;
2317 _p += SIZEOF_LONG;
2318 }
2319 s = _s;
2320 p = _p;
2321 if (s == e)
2322 break;
2323 ch = (unsigned char)*s;
2324 }
2325 }
2326
2327 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002328 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 s++;
2330 continue;
2331 }
2332
2333 n = utf8_code_length[ch];
2334
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002335 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 if (consumed)
2337 break;
2338 else {
2339 errmsg = "unexpected end of data";
2340 startinpos = s-starts;
2341 endinpos = size;
2342 goto utf8Error;
2343 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345
2346 switch (n) {
2347
2348 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002349 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002350 startinpos = s-starts;
2351 endinpos = startinpos+1;
2352 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002355 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 startinpos = s-starts;
2357 endinpos = startinpos+1;
2358 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359
2360 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002361 if ((s[1] & 0xc0) != 0x80) {
2362 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 startinpos = s-starts;
2364 endinpos = startinpos+2;
2365 goto utf8Error;
2366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002368 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002369 startinpos = s-starts;
2370 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002371 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 goto utf8Error;
2373 }
2374 else
2375 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 break;
2377
2378 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002379 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002380 (s[2] & 0xc0) != 0x80) {
2381 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 startinpos = s-starts;
2383 endinpos = startinpos+3;
2384 goto utf8Error;
2385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002387 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002388 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 startinpos = s-starts;
2390 endinpos = startinpos+3;
2391 goto utf8Error;
2392 }
2393 else
2394 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002395 break;
2396
2397 case 4:
2398 if ((s[1] & 0xc0) != 0x80 ||
2399 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 (s[3] & 0xc0) != 0x80) {
2401 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 startinpos = s-starts;
2403 endinpos = startinpos+4;
2404 goto utf8Error;
2405 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002406 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002407 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002408 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002409 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002410 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002411 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 UTF-16 */
2413 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002414 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002415 startinpos = s-starts;
2416 endinpos = startinpos+4;
2417 goto utf8Error;
2418 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002419#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002421#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002422 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002423
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002424 /* translate from 10000..10FFFF to 0..FFFF */
2425 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002426
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002427 /* high surrogate = top 10 bits added to D800 */
2428 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002429
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002430 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002431 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002432#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433 break;
2434
2435 default:
2436 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002438 startinpos = s-starts;
2439 endinpos = startinpos+n;
2440 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 }
2442 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002443 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Benjamin Peterson29060642009-01-31 22:14:21 +00002445 utf8Error:
2446 outpos = p-PyUnicode_AS_UNICODE(unicode);
2447 if (unicode_decode_call_errorhandler(
2448 errors, &errorHandler,
2449 "utf8", errmsg,
2450 &starts, &e, &startinpos, &endinpos, &exc, &s,
2451 &unicode, &outpos, &p))
2452 goto onError;
2453 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 }
Walter Dörwald69652032004-09-07 20:24:22 +00002455 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002456 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457
2458 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002459 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 goto onError;
2461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 Py_XDECREF(errorHandler);
2463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 return (PyObject *)unicode;
2465
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 Py_XDECREF(errorHandler);
2468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 Py_DECREF(unicode);
2470 return NULL;
2471}
2472
Antoine Pitrouab868312009-01-10 15:40:25 +00002473#undef ASCII_CHAR_MASK
2474
2475
Tim Peters602f7402002-04-27 18:03:26 +00002476/* Allocation strategy: if the string is short, convert into a stack buffer
2477 and allocate exactly as much space needed at the end. Else allocate the
2478 maximum possible needed (4 result bytes per Unicode character), and return
2479 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002480*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002481PyObject *
2482PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002483 Py_ssize_t size,
2484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485{
Tim Peters602f7402002-04-27 18:03:26 +00002486#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002487
Guido van Rossum98297ee2007-11-06 21:34:58 +00002488 Py_ssize_t i; /* index into s of next input byte */
2489 PyObject *result; /* result string object */
2490 char *p; /* next free byte in output buffer */
2491 Py_ssize_t nallocated; /* number of result bytes allocated */
2492 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002493 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002494 PyObject *errorHandler = NULL;
2495 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002496
Tim Peters602f7402002-04-27 18:03:26 +00002497 assert(s != NULL);
2498 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
Tim Peters602f7402002-04-27 18:03:26 +00002500 if (size <= MAX_SHORT_UNICHARS) {
2501 /* Write into the stack buffer; nallocated can't overflow.
2502 * At the end, we'll allocate exactly as much heap space as it
2503 * turns out we need.
2504 */
2505 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002506 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002507 p = stackbuf;
2508 }
2509 else {
2510 /* Overallocate on the heap, and give the excess back at the end. */
2511 nallocated = size * 4;
2512 if (nallocated / 4 != size) /* overflow! */
2513 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002514 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002515 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002516 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002517 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002518 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002519
Tim Peters602f7402002-04-27 18:03:26 +00002520 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002521 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002522
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002523 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002524 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002528 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002529 *p++ = (char)(0xc0 | (ch >> 6));
2530 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002531 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002532 else {
Tim Peters602f7402002-04-27 18:03:26 +00002533 /* Encode UCS2 Unicode ordinals */
2534 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002535#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002536 /* Special case: check for high surrogate */
2537 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2538 Py_UCS4 ch2 = s[i];
2539 /* Check for low surrogate and combine the two to
2540 form a UCS4 value */
2541 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002542 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002543 i++;
2544 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002545 }
Tim Peters602f7402002-04-27 18:03:26 +00002546 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002547 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002548#endif
2549 if (ch >= 0xd800 && ch <= 0xdfff) {
2550 Py_ssize_t newpos;
2551 PyObject *rep;
2552 char *prep;
2553 int k;
2554 rep = unicode_encode_call_errorhandler
2555 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2556 s, size, &exc, i-1, i, &newpos);
2557 if (!rep)
2558 goto error;
2559 /* Implementation limitations: only support error handler that return
2560 bytes, and only support up to four replacement bytes. */
2561 if (!PyBytes_Check(rep)) {
2562 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2563 Py_DECREF(rep);
2564 goto error;
2565 }
2566 if (PyBytes_Size(rep) > 4) {
2567 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2568 Py_DECREF(rep);
2569 goto error;
2570 }
2571 prep = PyBytes_AsString(rep);
2572 for(k = PyBytes_Size(rep); k > 0; k--)
2573 *p++ = *prep++;
2574 Py_DECREF(rep);
2575 continue;
2576
2577 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002578 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002579 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2580 *p++ = (char)(0x80 | (ch & 0x3f));
2581 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 }
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002583#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002584 encodeUCS4:
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002585#endif
Tim Peters602f7402002-04-27 18:03:26 +00002586 /* Encode UCS4 Unicode ordinals */
2587 *p++ = (char)(0xf0 | (ch >> 18));
2588 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2589 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2590 *p++ = (char)(0x80 | (ch & 0x3f));
2591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002593
Guido van Rossum98297ee2007-11-06 21:34:58 +00002594 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002595 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002596 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002597 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002598 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002599 }
2600 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002601 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002602 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002603 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002604 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002605 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002606 Py_XDECREF(errorHandler);
2607 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002608 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002609 error:
2610 Py_XDECREF(errorHandler);
2611 Py_XDECREF(exc);
2612 Py_XDECREF(result);
2613 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002614
Tim Peters602f7402002-04-27 18:03:26 +00002615#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616}
2617
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2619{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002624 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627}
2628
Walter Dörwald41980ca2007-08-16 21:55:45 +00002629/* --- UTF-32 Codec ------------------------------------------------------- */
2630
2631PyObject *
2632PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002633 Py_ssize_t size,
2634 const char *errors,
2635 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002636{
2637 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2638}
2639
2640PyObject *
2641PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002642 Py_ssize_t size,
2643 const char *errors,
2644 int *byteorder,
2645 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002646{
2647 const char *starts = s;
2648 Py_ssize_t startinpos;
2649 Py_ssize_t endinpos;
2650 Py_ssize_t outpos;
2651 PyUnicodeObject *unicode;
2652 Py_UNICODE *p;
2653#ifndef Py_UNICODE_WIDE
2654 int i, pairs;
2655#else
2656 const int pairs = 0;
2657#endif
2658 const unsigned char *q, *e;
2659 int bo = 0; /* assume native ordering by default */
2660 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002661 /* Offsets from q for retrieving bytes in the right order. */
2662#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2663 int iorder[] = {0, 1, 2, 3};
2664#else
2665 int iorder[] = {3, 2, 1, 0};
2666#endif
2667 PyObject *errorHandler = NULL;
2668 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002669 /* On narrow builds we split characters outside the BMP into two
2670 codepoints => count how much extra space we need. */
2671#ifndef Py_UNICODE_WIDE
2672 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002673 if (((Py_UCS4 *)s)[i] >= 0x10000)
2674 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002675#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002676
2677 /* This might be one to much, because of a BOM */
2678 unicode = _PyUnicode_New((size+3)/4+pairs);
2679 if (!unicode)
2680 return NULL;
2681 if (size == 0)
2682 return (PyObject *)unicode;
2683
2684 /* Unpack UTF-32 encoded data */
2685 p = unicode->str;
2686 q = (unsigned char *)s;
2687 e = q + size;
2688
2689 if (byteorder)
2690 bo = *byteorder;
2691
2692 /* Check for BOM marks (U+FEFF) in the input and adjust current
2693 byte order setting accordingly. In native mode, the leading BOM
2694 mark is skipped, in all other modes, it is copied to the output
2695 stream as-is (giving a ZWNBSP character). */
2696 if (bo == 0) {
2697 if (size >= 4) {
2698 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002700#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 if (bom == 0x0000FEFF) {
2702 q += 4;
2703 bo = -1;
2704 }
2705 else if (bom == 0xFFFE0000) {
2706 q += 4;
2707 bo = 1;
2708 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002709#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 if (bom == 0x0000FEFF) {
2711 q += 4;
2712 bo = 1;
2713 }
2714 else if (bom == 0xFFFE0000) {
2715 q += 4;
2716 bo = -1;
2717 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002718#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002720 }
2721
2722 if (bo == -1) {
2723 /* force LE */
2724 iorder[0] = 0;
2725 iorder[1] = 1;
2726 iorder[2] = 2;
2727 iorder[3] = 3;
2728 }
2729 else if (bo == 1) {
2730 /* force BE */
2731 iorder[0] = 3;
2732 iorder[1] = 2;
2733 iorder[2] = 1;
2734 iorder[3] = 0;
2735 }
2736
2737 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 Py_UCS4 ch;
2739 /* remaining bytes at the end? (size should be divisible by 4) */
2740 if (e-q<4) {
2741 if (consumed)
2742 break;
2743 errmsg = "truncated data";
2744 startinpos = ((const char *)q)-starts;
2745 endinpos = ((const char *)e)-starts;
2746 goto utf32Error;
2747 /* The remaining input chars are ignored if the callback
2748 chooses to skip the input */
2749 }
2750 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2751 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002752
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 if (ch >= 0x110000)
2754 {
2755 errmsg = "codepoint not in range(0x110000)";
2756 startinpos = ((const char *)q)-starts;
2757 endinpos = startinpos+4;
2758 goto utf32Error;
2759 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002760#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 if (ch >= 0x10000)
2762 {
2763 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2764 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2765 }
2766 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002767#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 *p++ = ch;
2769 q += 4;
2770 continue;
2771 utf32Error:
2772 outpos = p-PyUnicode_AS_UNICODE(unicode);
2773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "utf32", errmsg,
2776 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2777 &unicode, &outpos, &p))
2778 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779 }
2780
2781 if (byteorder)
2782 *byteorder = bo;
2783
2784 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002786
2787 /* Adjust length */
2788 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2789 goto onError;
2790
2791 Py_XDECREF(errorHandler);
2792 Py_XDECREF(exc);
2793 return (PyObject *)unicode;
2794
Benjamin Peterson29060642009-01-31 22:14:21 +00002795 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002796 Py_DECREF(unicode);
2797 Py_XDECREF(errorHandler);
2798 Py_XDECREF(exc);
2799 return NULL;
2800}
2801
2802PyObject *
2803PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 Py_ssize_t size,
2805 const char *errors,
2806 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002808 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002809 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002810 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002811#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002812 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002813#else
2814 const int pairs = 0;
2815#endif
2816 /* Offsets from p for storing byte pairs in the right order. */
2817#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2818 int iorder[] = {0, 1, 2, 3};
2819#else
2820 int iorder[] = {3, 2, 1, 0};
2821#endif
2822
Benjamin Peterson29060642009-01-31 22:14:21 +00002823#define STORECHAR(CH) \
2824 do { \
2825 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2826 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2827 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2828 p[iorder[0]] = (CH) & 0xff; \
2829 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002830 } while(0)
2831
2832 /* In narrow builds we can output surrogate pairs as one codepoint,
2833 so we need less space. */
2834#ifndef Py_UNICODE_WIDE
2835 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2837 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2838 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002839#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002840 nsize = (size - pairs + (byteorder == 0));
2841 bytesize = nsize * 4;
2842 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002844 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002845 if (v == NULL)
2846 return NULL;
2847
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002848 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002849 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002851 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002852 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002853
2854 if (byteorder == -1) {
2855 /* force LE */
2856 iorder[0] = 0;
2857 iorder[1] = 1;
2858 iorder[2] = 2;
2859 iorder[3] = 3;
2860 }
2861 else if (byteorder == 1) {
2862 /* force BE */
2863 iorder[0] = 3;
2864 iorder[1] = 2;
2865 iorder[2] = 1;
2866 iorder[3] = 0;
2867 }
2868
2869 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002870 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002871#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2873 Py_UCS4 ch2 = *s;
2874 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2875 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2876 s++;
2877 size--;
2878 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002879 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002880#endif
2881 STORECHAR(ch);
2882 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002883
2884 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002885 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002886#undef STORECHAR
2887}
2888
2889PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2890{
2891 if (!PyUnicode_Check(unicode)) {
2892 PyErr_BadArgument();
2893 return NULL;
2894 }
2895 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 PyUnicode_GET_SIZE(unicode),
2897 NULL,
2898 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899}
2900
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901/* --- UTF-16 Codec ------------------------------------------------------- */
2902
Tim Peters772747b2001-08-09 22:21:55 +00002903PyObject *
2904PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 Py_ssize_t size,
2906 const char *errors,
2907 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908{
Walter Dörwald69652032004-09-07 20:24:22 +00002909 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2910}
2911
Antoine Pitrouab868312009-01-10 15:40:25 +00002912/* Two masks for fast checking of whether a C 'long' may contain
2913 UTF16-encoded surrogate characters. This is an efficient heuristic,
2914 assuming that non-surrogate characters with a code point >= 0x8000 are
2915 rare in most input.
2916 FAST_CHAR_MASK is used when the input is in native byte ordering,
2917 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002918*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002919#if (SIZEOF_LONG == 8)
2920# define FAST_CHAR_MASK 0x8000800080008000L
2921# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2922#elif (SIZEOF_LONG == 4)
2923# define FAST_CHAR_MASK 0x80008000L
2924# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2925#else
2926# error C 'long' size should be either 4 or 8!
2927#endif
2928
Walter Dörwald69652032004-09-07 20:24:22 +00002929PyObject *
2930PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 Py_ssize_t size,
2932 const char *errors,
2933 int *byteorder,
2934 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002935{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002937 Py_ssize_t startinpos;
2938 Py_ssize_t endinpos;
2939 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 PyUnicodeObject *unicode;
2941 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002942 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002943 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002944 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002945 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002946 /* Offsets from q for retrieving byte pairs in the right order. */
2947#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2948 int ihi = 1, ilo = 0;
2949#else
2950 int ihi = 0, ilo = 1;
2951#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 PyObject *errorHandler = NULL;
2953 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
2955 /* Note: size will always be longer than the resulting Unicode
2956 character count */
2957 unicode = _PyUnicode_New(size);
2958 if (!unicode)
2959 return NULL;
2960 if (size == 0)
2961 return (PyObject *)unicode;
2962
2963 /* Unpack UTF-16 encoded data */
2964 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002965 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002966 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967
2968 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002969 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002971 /* Check for BOM marks (U+FEFF) in the input and adjust current
2972 byte order setting accordingly. In native mode, the leading BOM
2973 mark is skipped, in all other modes, it is copied to the output
2974 stream as-is (giving a ZWNBSP character). */
2975 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002976 if (size >= 2) {
2977 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002978#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 if (bom == 0xFEFF) {
2980 q += 2;
2981 bo = -1;
2982 }
2983 else if (bom == 0xFFFE) {
2984 q += 2;
2985 bo = 1;
2986 }
Tim Petersced69f82003-09-16 20:30:58 +00002987#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 if (bom == 0xFEFF) {
2989 q += 2;
2990 bo = 1;
2991 }
2992 else if (bom == 0xFFFE) {
2993 q += 2;
2994 bo = -1;
2995 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002996#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999
Tim Peters772747b2001-08-09 22:21:55 +00003000 if (bo == -1) {
3001 /* force LE */
3002 ihi = 1;
3003 ilo = 0;
3004 }
3005 else if (bo == 1) {
3006 /* force BE */
3007 ihi = 0;
3008 ilo = 1;
3009 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003010#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3011 native_ordering = ilo < ihi;
3012#else
3013 native_ordering = ilo > ihi;
3014#endif
Tim Peters772747b2001-08-09 22:21:55 +00003015
Antoine Pitrouab868312009-01-10 15:40:25 +00003016 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003017 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003019 /* First check for possible aligned read of a C 'long'. Unaligned
3020 reads are more expensive, better to defer to another iteration. */
3021 if (!((size_t) q & LONG_PTR_MASK)) {
3022 /* Fast path for runs of non-surrogate chars. */
3023 register const unsigned char *_q = q;
3024 Py_UNICODE *_p = p;
3025 if (native_ordering) {
3026 /* Native ordering is simple: as long as the input cannot
3027 possibly contain a surrogate char, do an unrolled copy
3028 of several 16-bit code points to the target object.
3029 The non-surrogate check is done on several input bytes
3030 at a time (as many as a C 'long' can contain). */
3031 while (_q < aligned_end) {
3032 unsigned long data = * (unsigned long *) _q;
3033 if (data & FAST_CHAR_MASK)
3034 break;
3035 _p[0] = ((unsigned short *) _q)[0];
3036 _p[1] = ((unsigned short *) _q)[1];
3037#if (SIZEOF_LONG == 8)
3038 _p[2] = ((unsigned short *) _q)[2];
3039 _p[3] = ((unsigned short *) _q)[3];
3040#endif
3041 _q += SIZEOF_LONG;
3042 _p += SIZEOF_LONG / 2;
3043 }
3044 }
3045 else {
3046 /* Byteswapped ordering is similar, but we must decompose
3047 the copy bytewise, and take care of zero'ing out the
3048 upper bytes if the target object is in 32-bit units
3049 (that is, in UCS-4 builds). */
3050 while (_q < aligned_end) {
3051 unsigned long data = * (unsigned long *) _q;
3052 if (data & SWAPPED_FAST_CHAR_MASK)
3053 break;
3054 /* Zero upper bytes in UCS-4 builds */
3055#if (Py_UNICODE_SIZE > 2)
3056 _p[0] = 0;
3057 _p[1] = 0;
3058#if (SIZEOF_LONG == 8)
3059 _p[2] = 0;
3060 _p[3] = 0;
3061#endif
3062#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003063 /* Issue #4916; UCS-4 builds on big endian machines must
3064 fill the two last bytes of each 4-byte unit. */
3065#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3066# define OFF 2
3067#else
3068# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003069#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003070 ((unsigned char *) _p)[OFF + 1] = _q[0];
3071 ((unsigned char *) _p)[OFF + 0] = _q[1];
3072 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3073 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3074#if (SIZEOF_LONG == 8)
3075 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3076 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3077 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3078 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3079#endif
3080#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003081 _q += SIZEOF_LONG;
3082 _p += SIZEOF_LONG / 2;
3083 }
3084 }
3085 p = _p;
3086 q = _q;
3087 if (q >= e)
3088 break;
3089 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091
Benjamin Peterson14339b62009-01-31 16:36:08 +00003092 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003093
3094 if (ch < 0xD800 || ch > 0xDFFF) {
3095 *p++ = ch;
3096 continue;
3097 }
3098
3099 /* UTF-16 code pair: */
3100 if (q > e) {
3101 errmsg = "unexpected end of data";
3102 startinpos = (((const char *)q) - 2) - starts;
3103 endinpos = ((const char *)e) + 1 - starts;
3104 goto utf16Error;
3105 }
3106 if (0xD800 <= ch && ch <= 0xDBFF) {
3107 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3108 q += 2;
3109 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003110#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 *p++ = ch;
3112 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003113#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003115#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 continue;
3117 }
3118 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003119 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 startinpos = (((const char *)q)-4)-starts;
3121 endinpos = startinpos+2;
3122 goto utf16Error;
3123 }
3124
Benjamin Peterson14339b62009-01-31 16:36:08 +00003125 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 errmsg = "illegal encoding";
3127 startinpos = (((const char *)q)-2)-starts;
3128 endinpos = startinpos+2;
3129 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003130
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 utf16Error:
3132 outpos = p - PyUnicode_AS_UNICODE(unicode);
3133 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003134 errors,
3135 &errorHandler,
3136 "utf16", errmsg,
3137 &starts,
3138 (const char **)&e,
3139 &startinpos,
3140 &endinpos,
3141 &exc,
3142 (const char **)&q,
3143 &unicode,
3144 &outpos,
3145 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003148 /* remaining byte at the end? (size should be even) */
3149 if (e == q) {
3150 if (!consumed) {
3151 errmsg = "truncated data";
3152 startinpos = ((const char *)q) - starts;
3153 endinpos = ((const char *)e) + 1 - starts;
3154 outpos = p - PyUnicode_AS_UNICODE(unicode);
3155 if (unicode_decode_call_errorhandler(
3156 errors,
3157 &errorHandler,
3158 "utf16", errmsg,
3159 &starts,
3160 (const char **)&e,
3161 &startinpos,
3162 &endinpos,
3163 &exc,
3164 (const char **)&q,
3165 &unicode,
3166 &outpos,
3167 &p))
3168 goto onError;
3169 /* The remaining input chars are ignored if the callback
3170 chooses to skip the input */
3171 }
3172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173
3174 if (byteorder)
3175 *byteorder = bo;
3176
Walter Dörwald69652032004-09-07 20:24:22 +00003177 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003179
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003181 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 goto onError;
3183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 Py_XDECREF(errorHandler);
3185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 return (PyObject *)unicode;
3187
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 Py_XDECREF(errorHandler);
3191 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 return NULL;
3193}
3194
Antoine Pitrouab868312009-01-10 15:40:25 +00003195#undef FAST_CHAR_MASK
3196#undef SWAPPED_FAST_CHAR_MASK
3197
Tim Peters772747b2001-08-09 22:21:55 +00003198PyObject *
3199PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 Py_ssize_t size,
3201 const char *errors,
3202 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003204 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003205 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003206 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003207#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003208 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003209#else
3210 const int pairs = 0;
3211#endif
Tim Peters772747b2001-08-09 22:21:55 +00003212 /* Offsets from p for storing byte pairs in the right order. */
3213#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3214 int ihi = 1, ilo = 0;
3215#else
3216 int ihi = 0, ilo = 1;
3217#endif
3218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219#define STORECHAR(CH) \
3220 do { \
3221 p[ihi] = ((CH) >> 8) & 0xff; \
3222 p[ilo] = (CH) & 0xff; \
3223 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003224 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003226#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003227 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (s[i] >= 0x10000)
3229 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003230#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003231 /* 2 * (size + pairs + (byteorder == 0)) */
3232 if (size > PY_SSIZE_T_MAX ||
3233 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003235 nsize = size + pairs + (byteorder == 0);
3236 bytesize = nsize * 2;
3237 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003239 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 if (v == NULL)
3241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003243 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003246 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003247 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003248
3249 if (byteorder == -1) {
3250 /* force LE */
3251 ihi = 1;
3252 ilo = 0;
3253 }
3254 else if (byteorder == 1) {
3255 /* force BE */
3256 ihi = 0;
3257 ilo = 1;
3258 }
3259
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003260 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 Py_UNICODE ch = *s++;
3262 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003263#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 if (ch >= 0x10000) {
3265 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3266 ch = 0xD800 | ((ch-0x10000) >> 10);
3267 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003268#endif
Tim Peters772747b2001-08-09 22:21:55 +00003269 STORECHAR(ch);
3270 if (ch2)
3271 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003272 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003273
3274 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003276#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277}
3278
3279PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3280{
3281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 PyUnicode_GET_SIZE(unicode),
3287 NULL,
3288 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289}
3290
3291/* --- Unicode Escape Codec ----------------------------------------------- */
3292
Fredrik Lundh06d12682001-01-24 07:59:11 +00003293static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003294
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 Py_ssize_t size,
3297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003300 Py_ssize_t startinpos;
3301 Py_ssize_t endinpos;
3302 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003307 char* message;
3308 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 PyObject *errorHandler = NULL;
3310 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 /* Escaped strings will always be longer than the resulting
3313 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 length after conversion to the true value.
3315 (but if the error callback returns a long replacement string
3316 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 v = _PyUnicode_New(size);
3318 if (v == NULL)
3319 goto onError;
3320 if (size == 0)
3321 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003322
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 while (s < end) {
3327 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003328 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Non-escape characters are interpreted as Unicode ordinals */
3332 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003333 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 continue;
3335 }
3336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 /* \ - Escapes */
3339 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003340 c = *s++;
3341 if (s > end)
3342 c = '\0'; /* Invalid after \ */
3343 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 case '\n': break;
3347 case '\\': *p++ = '\\'; break;
3348 case '\'': *p++ = '\''; break;
3349 case '\"': *p++ = '\"'; break;
3350 case 'b': *p++ = '\b'; break;
3351 case 'f': *p++ = '\014'; break; /* FF */
3352 case 't': *p++ = '\t'; break;
3353 case 'n': *p++ = '\n'; break;
3354 case 'r': *p++ = '\r'; break;
3355 case 'v': *p++ = '\013'; break; /* VT */
3356 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3357
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 case '0': case '1': case '2': case '3':
3360 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003361 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003362 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003363 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003364 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003365 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003367 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 break;
3369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 /* hex escapes */
3371 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003373 digits = 2;
3374 message = "truncated \\xXX escape";
3375 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003379 digits = 4;
3380 message = "truncated \\uXXXX escape";
3381 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003384 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003385 digits = 8;
3386 message = "truncated \\UXXXXXXXX escape";
3387 hexescape:
3388 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 outpos = p-PyUnicode_AS_UNICODE(v);
3390 if (s+digits>end) {
3391 endinpos = size;
3392 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 errors, &errorHandler,
3394 "unicodeescape", "end of string in escape sequence",
3395 &starts, &end, &startinpos, &endinpos, &exc, &s,
3396 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 goto onError;
3398 goto nextByte;
3399 }
3400 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003401 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003402 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 endinpos = (s+i+1)-starts;
3404 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 errors, &errorHandler,
3406 "unicodeescape", message,
3407 &starts, &end, &startinpos, &endinpos, &exc, &s,
3408 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003409 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003411 }
3412 chr = (chr<<4) & ~0xF;
3413 if (c >= '0' && c <= '9')
3414 chr += c - '0';
3415 else if (c >= 'a' && c <= 'f')
3416 chr += 10 + c - 'a';
3417 else
3418 chr += 10 + c - 'A';
3419 }
3420 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003421 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 /* _decoding_error will have already written into the
3423 target buffer. */
3424 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003425 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003426 /* when we get here, chr is a 32-bit unicode character */
3427 if (chr <= 0xffff)
3428 /* UCS-2 character */
3429 *p++ = (Py_UNICODE) chr;
3430 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003431 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003432 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003433#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003434 *p++ = chr;
3435#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003436 chr -= 0x10000L;
3437 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003438 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003439#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003440 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 endinpos = s-starts;
3442 outpos = p-PyUnicode_AS_UNICODE(v);
3443 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 errors, &errorHandler,
3445 "unicodeescape", "illegal Unicode character",
3446 &starts, &end, &startinpos, &endinpos, &exc, &s,
3447 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003448 goto onError;
3449 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003450 break;
3451
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003453 case 'N':
3454 message = "malformed \\N character escape";
3455 if (ucnhash_CAPI == NULL) {
3456 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003457 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003458 if (ucnhash_CAPI == NULL)
3459 goto ucnhashError;
3460 }
3461 if (*s == '{') {
3462 const char *start = s+1;
3463 /* look for the closing brace */
3464 while (*s != '}' && s < end)
3465 s++;
3466 if (s > start && s < end && *s == '}') {
3467 /* found a name. look it up in the unicode database */
3468 message = "unknown Unicode character name";
3469 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003470 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003471 goto store;
3472 }
3473 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 endinpos = s-starts;
3475 outpos = p-PyUnicode_AS_UNICODE(v);
3476 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003477 errors, &errorHandler,
3478 "unicodeescape", message,
3479 &starts, &end, &startinpos, &endinpos, &exc, &s,
3480 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003481 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003482 break;
3483
3484 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003485 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 message = "\\ at end of string";
3487 s--;
3488 endinpos = s-starts;
3489 outpos = p-PyUnicode_AS_UNICODE(v);
3490 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 errors, &errorHandler,
3492 "unicodeescape", message,
3493 &starts, &end, &startinpos, &endinpos, &exc, &s,
3494 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003495 goto onError;
3496 }
3497 else {
3498 *p++ = '\\';
3499 *p++ = (unsigned char)s[-1];
3500 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003501 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003506 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003508 Py_XDECREF(errorHandler);
3509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003511
Benjamin Peterson29060642009-01-31 22:14:21 +00003512 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003513 PyErr_SetString(
3514 PyExc_UnicodeError,
3515 "\\N escapes not supported (can't load unicodedata module)"
3516 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003517 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 Py_XDECREF(errorHandler);
3519 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003520 return NULL;
3521
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 Py_XDECREF(errorHandler);
3525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 return NULL;
3527}
3528
/* Return a Unicode-Escape string version of the Unicode object.

   This describes PyUnicode_EncodeUnicodeEscape() further below.  That
   function takes no quotes argument: the escaped text is returned as a
   bytes object without any enclosing u"" or u'' quotes.

*/
3535
Thomas Wouters477c8d52006-05-27 19:21:47 +00003536Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 Py_ssize_t size,
3538 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003539{
3540 /* like wcschr, but doesn't stop at NULL characters */
3541
3542 while (size-- > 0) {
3543 if (*s == ch)
3544 return s;
3545 s++;
3546 }
3547
3548 return NULL;
3549}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003550
Walter Dörwald79e913e2007-05-12 11:08:06 +00003551static const char *hexdigits = "0123456789abcdef";
3552
3553PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003556 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003559#ifdef Py_UNICODE_WIDE
3560 const Py_ssize_t expandsize = 10;
3561#else
3562 const Py_ssize_t expandsize = 6;
3563#endif
3564
Thomas Wouters89f507f2006-12-13 04:49:30 +00003565 /* XXX(nnorwitz): rather than over-allocating, it would be
3566 better to choose a different scheme. Perhaps scan the
3567 first N-chars of the string and allocate based on that size.
3568 */
3569 /* Initial allocation is based on the longest-possible unichr
3570 escape.
3571
3572 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3573 unichr, so in this case it's the longest unichr escape. In
3574 narrow (UTF-16) builds this is five chars per source unichr
3575 since there are two unichrs in the surrogate pair, so in narrow
3576 (UTF-16) builds it's not the longest unichr escape.
3577
3578 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3579 so in the narrow (UTF-16) build case it's the longest unichr
3580 escape.
3581 */
3582
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003583 if (size == 0)
3584 return PyBytes_FromStringAndSize(NULL, 0);
3585
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003586 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003588
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003589 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 2
3591 + expandsize*size
3592 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 if (repr == NULL)
3594 return NULL;
3595
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003596 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 while (size-- > 0) {
3599 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003600
Walter Dörwald79e913e2007-05-12 11:08:06 +00003601 /* Escape backslashes */
3602 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 *p++ = '\\';
3604 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003605 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003606 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003607
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003608#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003609 /* Map 21-bit characters to '\U00xxxxxx' */
3610 else if (ch >= 0x10000) {
3611 *p++ = '\\';
3612 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003613 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3614 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3615 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3616 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3617 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3618 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3619 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3620 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003622 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003623#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3625 else if (ch >= 0xD800 && ch < 0xDC00) {
3626 Py_UNICODE ch2;
3627 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003628
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 ch2 = *s++;
3630 size--;
3631 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3632 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3633 *p++ = '\\';
3634 *p++ = 'U';
3635 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3636 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3637 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3638 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3639 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3640 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3641 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3642 *p++ = hexdigits[ucs & 0x0000000F];
3643 continue;
3644 }
3645 /* Fall through: isolated surrogates are copied as-is */
3646 s--;
3647 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003648 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003649#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003652 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 *p++ = '\\';
3654 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003655 *p++ = hexdigits[(ch >> 12) & 0x000F];
3656 *p++ = hexdigits[(ch >> 8) & 0x000F];
3657 *p++ = hexdigits[(ch >> 4) & 0x000F];
3658 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003660
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003661 /* Map special whitespace to '\t', \n', '\r' */
3662 else if (ch == '\t') {
3663 *p++ = '\\';
3664 *p++ = 't';
3665 }
3666 else if (ch == '\n') {
3667 *p++ = '\\';
3668 *p++ = 'n';
3669 }
3670 else if (ch == '\r') {
3671 *p++ = '\\';
3672 *p++ = 'r';
3673 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003674
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003675 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003676 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003678 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003679 *p++ = hexdigits[(ch >> 4) & 0x000F];
3680 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003681 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 /* Copy everything else as-is */
3684 else
3685 *p++ = (char) ch;
3686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003688 assert(p - PyBytes_AS_STRING(repr) > 0);
3689 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3690 return NULL;
3691 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692}
3693
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003694PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003696 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
3699 return NULL;
3700 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003701 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3702 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003703 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704}
3705
3706/* --- Raw Unicode Escape Codec ------------------------------------------- */
3707
3708PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 Py_ssize_t size,
3710 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003713 Py_ssize_t startinpos;
3714 Py_ssize_t endinpos;
3715 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 const char *end;
3719 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 PyObject *errorHandler = NULL;
3721 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 /* Escaped strings will always be longer than the resulting
3724 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 length after conversion to the true value. (But decoding error
3726 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 v = _PyUnicode_New(size);
3728 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 end = s + size;
3734 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 unsigned char c;
3736 Py_UCS4 x;
3737 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003738 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 /* Non-escape characters are interpreted as Unicode ordinals */
3741 if (*s != '\\') {
3742 *p++ = (unsigned char)*s++;
3743 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 startinpos = s-starts;
3746
3747 /* \u-escapes are only interpreted iff the number of leading
3748 backslashes if odd */
3749 bs = s;
3750 for (;s < end;) {
3751 if (*s != '\\')
3752 break;
3753 *p++ = (unsigned char)*s++;
3754 }
3755 if (((s - bs) & 1) == 0 ||
3756 s >= end ||
3757 (*s != 'u' && *s != 'U')) {
3758 continue;
3759 }
3760 p--;
3761 count = *s=='u' ? 4 : 8;
3762 s++;
3763
3764 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3765 outpos = p-PyUnicode_AS_UNICODE(v);
3766 for (x = 0, i = 0; i < count; ++i, ++s) {
3767 c = (unsigned char)*s;
3768 if (!ISXDIGIT(c)) {
3769 endinpos = s-starts;
3770 if (unicode_decode_call_errorhandler(
3771 errors, &errorHandler,
3772 "rawunicodeescape", "truncated \\uXXXX",
3773 &starts, &end, &startinpos, &endinpos, &exc, &s,
3774 &v, &outpos, &p))
3775 goto onError;
3776 goto nextByte;
3777 }
3778 x = (x<<4) & ~0xF;
3779 if (c >= '0' && c <= '9')
3780 x += c - '0';
3781 else if (c >= 'a' && c <= 'f')
3782 x += 10 + c - 'a';
3783 else
3784 x += 10 + c - 'A';
3785 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003786 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 /* UCS-2 character */
3788 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003789 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* UCS-4 character. Either store directly, or as
3791 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003792#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003794#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 x -= 0x10000L;
3796 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3797 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003798#endif
3799 } else {
3800 endinpos = s-starts;
3801 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003802 if (unicode_decode_call_errorhandler(
3803 errors, &errorHandler,
3804 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 &starts, &end, &startinpos, &endinpos, &exc, &s,
3806 &v, &outpos, &p))
3807 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003808 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003809 nextByte:
3810 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003812 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 Py_XDECREF(errorHandler);
3815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 Py_XDECREF(errorHandler);
3821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 return NULL;
3823}
3824
3825PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003828 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 char *p;
3830 char *q;
3831
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003832#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003833 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003834#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003835 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003836#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003837
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003838 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003840
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003841 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 if (repr == NULL)
3843 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003844 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003845 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003847 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 while (size-- > 0) {
3849 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003850#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 /* Map 32-bit characters to '\Uxxxxxxxx' */
3852 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003853 *p++ = '\\';
3854 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003855 *p++ = hexdigits[(ch >> 28) & 0xf];
3856 *p++ = hexdigits[(ch >> 24) & 0xf];
3857 *p++ = hexdigits[(ch >> 20) & 0xf];
3858 *p++ = hexdigits[(ch >> 16) & 0xf];
3859 *p++ = hexdigits[(ch >> 12) & 0xf];
3860 *p++ = hexdigits[(ch >> 8) & 0xf];
3861 *p++ = hexdigits[(ch >> 4) & 0xf];
3862 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003863 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003864 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003865#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3867 if (ch >= 0xD800 && ch < 0xDC00) {
3868 Py_UNICODE ch2;
3869 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003870
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 ch2 = *s++;
3872 size--;
3873 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3874 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3875 *p++ = '\\';
3876 *p++ = 'U';
3877 *p++ = hexdigits[(ucs >> 28) & 0xf];
3878 *p++ = hexdigits[(ucs >> 24) & 0xf];
3879 *p++ = hexdigits[(ucs >> 20) & 0xf];
3880 *p++ = hexdigits[(ucs >> 16) & 0xf];
3881 *p++ = hexdigits[(ucs >> 12) & 0xf];
3882 *p++ = hexdigits[(ucs >> 8) & 0xf];
3883 *p++ = hexdigits[(ucs >> 4) & 0xf];
3884 *p++ = hexdigits[ucs & 0xf];
3885 continue;
3886 }
3887 /* Fall through: isolated surrogates are copied as-is */
3888 s--;
3889 size++;
3890 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003891#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 /* Map 16-bit characters to '\uxxxx' */
3893 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 *p++ = '\\';
3895 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003896 *p++ = hexdigits[(ch >> 12) & 0xf];
3897 *p++ = hexdigits[(ch >> 8) & 0xf];
3898 *p++ = hexdigits[(ch >> 4) & 0xf];
3899 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 /* Copy everything else as-is */
3902 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 *p++ = (char) ch;
3904 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003905 size = p - q;
3906
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003907 assert(size > 0);
3908 if (_PyBytes_Resize(&repr, size) < 0)
3909 return NULL;
3910 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911}
3912
3913PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3914{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003915 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003917 PyErr_BadArgument();
3918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003920 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3921 PyUnicode_GET_SIZE(unicode));
3922
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003923 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924}
3925
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003926/* --- Unicode Internal Codec ------------------------------------------- */
3927
3928PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 Py_ssize_t size,
3930 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003931{
3932 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003933 Py_ssize_t startinpos;
3934 Py_ssize_t endinpos;
3935 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003936 PyUnicodeObject *v;
3937 Py_UNICODE *p;
3938 const char *end;
3939 const char *reason;
3940 PyObject *errorHandler = NULL;
3941 PyObject *exc = NULL;
3942
Neal Norwitzd43069c2006-01-08 01:12:10 +00003943#ifdef Py_UNICODE_WIDE
3944 Py_UNICODE unimax = PyUnicode_GetMax();
3945#endif
3946
Thomas Wouters89f507f2006-12-13 04:49:30 +00003947 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003948 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3949 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003951 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003953 p = PyUnicode_AS_UNICODE(v);
3954 end = s + size;
3955
3956 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003957 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003958 /* We have to sanity check the raw data, otherwise doom looms for
3959 some malformed UCS-4 data. */
3960 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003961#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003962 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003963#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003964 end-s < Py_UNICODE_SIZE
3965 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003967 startinpos = s - starts;
3968 if (end-s < Py_UNICODE_SIZE) {
3969 endinpos = end-starts;
3970 reason = "truncated input";
3971 }
3972 else {
3973 endinpos = s - starts + Py_UNICODE_SIZE;
3974 reason = "illegal code point (> 0x10FFFF)";
3975 }
3976 outpos = p - PyUnicode_AS_UNICODE(v);
3977 if (unicode_decode_call_errorhandler(
3978 errors, &errorHandler,
3979 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003980 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003981 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003982 goto onError;
3983 }
3984 }
3985 else {
3986 p++;
3987 s += Py_UNICODE_SIZE;
3988 }
3989 }
3990
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003991 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003992 goto onError;
3993 Py_XDECREF(errorHandler);
3994 Py_XDECREF(exc);
3995 return (PyObject *)v;
3996
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003998 Py_XDECREF(v);
3999 Py_XDECREF(errorHandler);
4000 Py_XDECREF(exc);
4001 return NULL;
4002}
4003
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004/* --- Latin-1 Codec ------------------------------------------------------ */
4005
4006PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 Py_ssize_t size,
4008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009{
4010 PyUnicodeObject *v;
4011 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004012 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004013
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004015 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 Py_UNICODE r = *(unsigned char*)s;
4017 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004018 }
4019
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 v = _PyUnicode_New(size);
4021 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004026 e = s + size;
4027 /* Unrolling the copy makes it much faster by reducing the looping
4028 overhead. This is similar to what many memcpy() implementations do. */
4029 unrolled_end = e - 4;
4030 while (s < unrolled_end) {
4031 p[0] = (unsigned char) s[0];
4032 p[1] = (unsigned char) s[1];
4033 p[2] = (unsigned char) s[2];
4034 p[3] = (unsigned char) s[3];
4035 s += 4;
4036 p += 4;
4037 }
4038 while (s < e)
4039 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004041
Benjamin Peterson29060642009-01-31 22:14:21 +00004042 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 Py_XDECREF(v);
4044 return NULL;
4045}
4046
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047/* create or adjust a UnicodeEncodeError */
4048static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 const char *encoding,
4050 const Py_UNICODE *unicode, Py_ssize_t size,
4051 Py_ssize_t startpos, Py_ssize_t endpos,
4052 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 *exceptionObject = PyUnicodeEncodeError_Create(
4056 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 }
4058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4060 goto onError;
4061 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4062 goto onError;
4063 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4064 goto onError;
4065 return;
4066 onError:
4067 Py_DECREF(*exceptionObject);
4068 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
4070}
4071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072/* raises a UnicodeEncodeError */
4073static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 const char *encoding,
4075 const Py_UNICODE *unicode, Py_ssize_t size,
4076 Py_ssize_t startpos, Py_ssize_t endpos,
4077 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078{
4079 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083}
4084
4085/* error handling callback helper:
4086 build arguments, call the callback and check the arguments,
4087 put the result into newpos and return the replacement string, which
4088 has to be freed by the caller */
4089static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 PyObject **errorHandler,
4091 const char *encoding, const char *reason,
4092 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4093 Py_ssize_t startpos, Py_ssize_t endpos,
4094 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004096 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097
4098 PyObject *restuple;
4099 PyObject *resunicode;
4100
4101 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 }
4106
4107 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111
4112 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004117 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 Py_DECREF(restuple);
4119 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004121 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 &resunicode, newpos)) {
4123 Py_DECREF(restuple);
4124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004126 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4127 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4128 Py_DECREF(restuple);
4129 return NULL;
4130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004133 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4135 Py_DECREF(restuple);
4136 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 Py_INCREF(resunicode);
4139 Py_DECREF(restuple);
4140 return resunicode;
4141}
4142
4143static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 Py_ssize_t size,
4145 const char *errors,
4146 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147{
4148 /* output object */
4149 PyObject *res;
4150 /* pointers to the beginning and end+1 of input */
4151 const Py_UNICODE *startp = p;
4152 const Py_UNICODE *endp = p + size;
4153 /* pointer to the beginning of the unencodable characters */
4154 /* const Py_UNICODE *badp = NULL; */
4155 /* pointer into the output */
4156 char *str;
4157 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004158 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004159 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4160 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 PyObject *errorHandler = NULL;
4162 PyObject *exc = NULL;
4163 /* the following variable is used for caching string comparisons
4164 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4165 int known_errorHandler = -1;
4166
4167 /* allocate enough for a simple encoding without
4168 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004169 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004170 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004171 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004173 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004174 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 ressize = size;
4176
4177 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 /* can we encode this? */
4181 if (c<limit) {
4182 /* no overflow check, because we know that the space is enough */
4183 *str++ = (char)c;
4184 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 else {
4187 Py_ssize_t unicodepos = p-startp;
4188 Py_ssize_t requiredsize;
4189 PyObject *repunicode;
4190 Py_ssize_t repsize;
4191 Py_ssize_t newpos;
4192 Py_ssize_t respos;
4193 Py_UNICODE *uni2;
4194 /* startpos for collecting unencodable chars */
4195 const Py_UNICODE *collstart = p;
4196 const Py_UNICODE *collend = p;
4197 /* find all unecodable characters */
4198 while ((collend < endp) && ((*collend)>=limit))
4199 ++collend;
4200 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4201 if (known_errorHandler==-1) {
4202 if ((errors==NULL) || (!strcmp(errors, "strict")))
4203 known_errorHandler = 1;
4204 else if (!strcmp(errors, "replace"))
4205 known_errorHandler = 2;
4206 else if (!strcmp(errors, "ignore"))
4207 known_errorHandler = 3;
4208 else if (!strcmp(errors, "xmlcharrefreplace"))
4209 known_errorHandler = 4;
4210 else
4211 known_errorHandler = 0;
4212 }
4213 switch (known_errorHandler) {
4214 case 1: /* strict */
4215 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4216 goto onError;
4217 case 2: /* replace */
4218 while (collstart++<collend)
4219 *str++ = '?'; /* fall through */
4220 case 3: /* ignore */
4221 p = collend;
4222 break;
4223 case 4: /* xmlcharrefreplace */
4224 respos = str - PyBytes_AS_STRING(res);
4225 /* determine replacement size (temporarily (mis)uses p) */
4226 for (p = collstart, repsize = 0; p < collend; ++p) {
4227 if (*p<10)
4228 repsize += 2+1+1;
4229 else if (*p<100)
4230 repsize += 2+2+1;
4231 else if (*p<1000)
4232 repsize += 2+3+1;
4233 else if (*p<10000)
4234 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004235#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 else
4237 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004238#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 else if (*p<100000)
4240 repsize += 2+5+1;
4241 else if (*p<1000000)
4242 repsize += 2+6+1;
4243 else
4244 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 }
4247 requiredsize = respos+repsize+(endp-collend);
4248 if (requiredsize > ressize) {
4249 if (requiredsize<2*ressize)
4250 requiredsize = 2*ressize;
4251 if (_PyBytes_Resize(&res, requiredsize))
4252 goto onError;
4253 str = PyBytes_AS_STRING(res) + respos;
4254 ressize = requiredsize;
4255 }
4256 /* generate replacement (temporarily (mis)uses p) */
4257 for (p = collstart; p < collend; ++p) {
4258 str += sprintf(str, "&#%d;", (int)*p);
4259 }
4260 p = collend;
4261 break;
4262 default:
4263 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4264 encoding, reason, startp, size, &exc,
4265 collstart-startp, collend-startp, &newpos);
4266 if (repunicode == NULL)
4267 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004268 if (PyBytes_Check(repunicode)) {
4269 /* Directly copy bytes result to output. */
4270 repsize = PyBytes_Size(repunicode);
4271 if (repsize > 1) {
4272 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004273 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004274 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4275 Py_DECREF(repunicode);
4276 goto onError;
4277 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004278 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004279 ressize += repsize-1;
4280 }
4281 memcpy(str, PyBytes_AsString(repunicode), repsize);
4282 str += repsize;
4283 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004284 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004285 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004286 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 /* need more space? (at least enough for what we
4288 have+the replacement+the rest of the string, so
4289 we won't have to check space for encodable characters) */
4290 respos = str - PyBytes_AS_STRING(res);
4291 repsize = PyUnicode_GET_SIZE(repunicode);
4292 requiredsize = respos+repsize+(endp-collend);
4293 if (requiredsize > ressize) {
4294 if (requiredsize<2*ressize)
4295 requiredsize = 2*ressize;
4296 if (_PyBytes_Resize(&res, requiredsize)) {
4297 Py_DECREF(repunicode);
4298 goto onError;
4299 }
4300 str = PyBytes_AS_STRING(res) + respos;
4301 ressize = requiredsize;
4302 }
4303 /* check if there is anything unencodable in the replacement
4304 and copy it to the output */
4305 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4306 c = *uni2;
4307 if (c >= limit) {
4308 raise_encode_exception(&exc, encoding, startp, size,
4309 unicodepos, unicodepos+1, reason);
4310 Py_DECREF(repunicode);
4311 goto onError;
4312 }
4313 *str = (char)c;
4314 }
4315 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004316 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004318 }
4319 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004320 /* Resize if we allocated to much */
4321 size = str - PyBytes_AS_STRING(res);
4322 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004323 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004324 if (_PyBytes_Resize(&res, size) < 0)
4325 goto onError;
4326 }
4327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_XDECREF(errorHandler);
4329 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004330 return res;
4331
4332 onError:
4333 Py_XDECREF(res);
4334 Py_XDECREF(errorHandler);
4335 Py_XDECREF(exc);
4336 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337}
4338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 Py_ssize_t size,
4341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344}
4345
4346PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4347{
4348 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 PyErr_BadArgument();
4350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 }
4352 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 PyUnicode_GET_SIZE(unicode),
4354 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355}
4356
4357/* --- 7-bit ASCII Codec -------------------------------------------------- */
4358
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 Py_ssize_t size,
4361 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 PyUnicodeObject *v;
4365 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004366 Py_ssize_t startinpos;
4367 Py_ssize_t endinpos;
4368 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 const char *e;
4370 PyObject *errorHandler = NULL;
4371 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004374 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 Py_UNICODE r = *(unsigned char*)s;
4376 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004377 }
Tim Petersced69f82003-09-16 20:30:58 +00004378
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 v = _PyUnicode_New(size);
4380 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 e = s + size;
4386 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 register unsigned char c = (unsigned char)*s;
4388 if (c < 128) {
4389 *p++ = c;
4390 ++s;
4391 }
4392 else {
4393 startinpos = s-starts;
4394 endinpos = startinpos + 1;
4395 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4396 if (unicode_decode_call_errorhandler(
4397 errors, &errorHandler,
4398 "ascii", "ordinal not in range(128)",
4399 &starts, &e, &startinpos, &endinpos, &exc, &s,
4400 &v, &outpos, &p))
4401 goto onError;
4402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004404 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4406 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_XDECREF(errorHandler);
4408 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004410
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_XDECREF(errorHandler);
4414 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 return NULL;
4416}
4417
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_ssize_t size,
4420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423}
4424
4425PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4426{
4427 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 PyErr_BadArgument();
4429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 }
4431 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 PyUnicode_GET_SIZE(unicode),
4433 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434}
4435
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004436#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004437
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004438/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004439
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004440#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004441#define NEED_RETRY
4442#endif
4443
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   starts a character, zero otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the previous
   character (as located by CharPrev) is the byte itself, is not a lead
   byte, or lies exactly two bytes back. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4459
4460/*
4461 * Decode MBCS string into unicode object. If 'final' is set, converts
4462 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4463 */
4464static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 const char *s, /* MBCS string */
4466 int size, /* sizeof MBCS string */
4467 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004468{
4469 Py_UNICODE *p;
4470 Py_ssize_t n = 0;
4471 int usize = 0;
4472
4473 assert(size >= 0);
4474
4475 /* Skip trailing lead-byte unless 'final' is set */
4476 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004478
4479 /* First get the size of the result */
4480 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4482 if (usize == 0) {
4483 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4484 return -1;
4485 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004486 }
4487
4488 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 /* Create unicode object */
4490 *v = _PyUnicode_New(usize);
4491 if (*v == NULL)
4492 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004493 }
4494 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 /* Extend unicode object */
4496 n = PyUnicode_GET_SIZE(*v);
4497 if (_PyUnicode_Resize(v, n + usize) < 0)
4498 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004499 }
4500
4501 /* Do the conversion */
4502 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 p = PyUnicode_AS_UNICODE(*v) + n;
4504 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4505 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4506 return -1;
4507 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004508 }
4509
4510 return size;
4511}
4512
4513PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 Py_ssize_t size,
4515 const char *errors,
4516 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004517{
4518 PyUnicodeObject *v = NULL;
4519 int done;
4520
4521 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004523
4524#ifdef NEED_RETRY
4525 retry:
4526 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528 else
4529#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004531
4532 if (done < 0) {
4533 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004535 }
4536
4537 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004539
4540#ifdef NEED_RETRY
4541 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 s += done;
4543 size -= done;
4544 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545 }
4546#endif
4547
4548 return (PyObject *)v;
4549}
4550
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004551PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 Py_ssize_t size,
4553 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004554{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004555 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4556}
4557
4558/*
4559 * Convert unicode into string object (MBCS).
4560 * Returns 0 if succeed, -1 otherwise.
4561 */
4562static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 const Py_UNICODE *p, /* unicode */
4564 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565{
4566 int mbcssize = 0;
4567 Py_ssize_t n = 0;
4568
4569 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004570
4571 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4574 if (mbcssize == 0) {
4575 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4576 return -1;
4577 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004578 }
4579
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 /* Create string object */
4582 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4583 if (*repr == NULL)
4584 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004585 }
4586 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 /* Extend string object */
4588 n = PyBytes_Size(*repr);
4589 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4590 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004591 }
4592
4593 /* Do the conversion */
4594 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 char *s = PyBytes_AS_STRING(*repr) + n;
4596 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4597 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4598 return -1;
4599 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004600 }
4601
4602 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004603}
4604
4605PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 Py_ssize_t size,
4607 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004608{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004609 PyObject *repr = NULL;
4610 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004611
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616 else
4617#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004619
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 Py_XDECREF(repr);
4622 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004623 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004624
4625#ifdef NEED_RETRY
4626 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 p += INT_MAX;
4628 size -= INT_MAX;
4629 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004630 }
4631#endif
4632
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004633 return repr;
4634}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004635
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004636PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4637{
4638 if (!PyUnicode_Check(unicode)) {
4639 PyErr_BadArgument();
4640 return NULL;
4641 }
4642 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 PyUnicode_GET_SIZE(unicode),
4644 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004645}
4646
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004647#undef NEED_RETRY
4648
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004649#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004650
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651/* --- Character Mapping Codec -------------------------------------------- */
4652
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 Py_ssize_t size,
4655 PyObject *mapping,
4656 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004659 Py_ssize_t startinpos;
4660 Py_ssize_t endinpos;
4661 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 PyUnicodeObject *v;
4664 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004665 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 PyObject *errorHandler = NULL;
4667 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004668 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 /* Default to Latin-1 */
4672 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674
4675 v = _PyUnicode_New(size);
4676 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004682 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 mapstring = PyUnicode_AS_UNICODE(mapping);
4684 maplen = PyUnicode_GET_SIZE(mapping);
4685 while (s < e) {
4686 unsigned char ch = *s;
4687 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 if (ch < maplen)
4690 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 if (x == 0xfffe) {
4693 /* undefined mapping */
4694 outpos = p-PyUnicode_AS_UNICODE(v);
4695 startinpos = s-starts;
4696 endinpos = startinpos+1;
4697 if (unicode_decode_call_errorhandler(
4698 errors, &errorHandler,
4699 "charmap", "character maps to <undefined>",
4700 &starts, &e, &startinpos, &endinpos, &exc, &s,
4701 &v, &outpos, &p)) {
4702 goto onError;
4703 }
4704 continue;
4705 }
4706 *p++ = x;
4707 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004708 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004709 }
4710 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 while (s < e) {
4712 unsigned char ch = *s;
4713 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004714
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4716 w = PyLong_FromLong((long)ch);
4717 if (w == NULL)
4718 goto onError;
4719 x = PyObject_GetItem(mapping, w);
4720 Py_DECREF(w);
4721 if (x == NULL) {
4722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4723 /* No mapping found means: mapping is undefined. */
4724 PyErr_Clear();
4725 x = Py_None;
4726 Py_INCREF(x);
4727 } else
4728 goto onError;
4729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 /* Apply mapping */
4732 if (PyLong_Check(x)) {
4733 long value = PyLong_AS_LONG(x);
4734 if (value < 0 || value > 65535) {
4735 PyErr_SetString(PyExc_TypeError,
4736 "character mapping must be in range(65536)");
4737 Py_DECREF(x);
4738 goto onError;
4739 }
4740 *p++ = (Py_UNICODE)value;
4741 }
4742 else if (x == Py_None) {
4743 /* undefined mapping */
4744 outpos = p-PyUnicode_AS_UNICODE(v);
4745 startinpos = s-starts;
4746 endinpos = startinpos+1;
4747 if (unicode_decode_call_errorhandler(
4748 errors, &errorHandler,
4749 "charmap", "character maps to <undefined>",
4750 &starts, &e, &startinpos, &endinpos, &exc, &s,
4751 &v, &outpos, &p)) {
4752 Py_DECREF(x);
4753 goto onError;
4754 }
4755 Py_DECREF(x);
4756 continue;
4757 }
4758 else if (PyUnicode_Check(x)) {
4759 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004760
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 if (targetsize == 1)
4762 /* 1-1 mapping */
4763 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004764
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 else if (targetsize > 1) {
4766 /* 1-n mapping */
4767 if (targetsize > extrachars) {
4768 /* resize first */
4769 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4770 Py_ssize_t needed = (targetsize - extrachars) + \
4771 (targetsize << 2);
4772 extrachars += needed;
4773 /* XXX overflow detection missing */
4774 if (_PyUnicode_Resize(&v,
4775 PyUnicode_GET_SIZE(v) + needed) < 0) {
4776 Py_DECREF(x);
4777 goto onError;
4778 }
4779 p = PyUnicode_AS_UNICODE(v) + oldpos;
4780 }
4781 Py_UNICODE_COPY(p,
4782 PyUnicode_AS_UNICODE(x),
4783 targetsize);
4784 p += targetsize;
4785 extrachars -= targetsize;
4786 }
4787 /* 1-0 mapping: skip the character */
4788 }
4789 else {
4790 /* wrong return value */
4791 PyErr_SetString(PyExc_TypeError,
4792 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004793 Py_DECREF(x);
4794 goto onError;
4795 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 Py_DECREF(x);
4797 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 }
4800 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004806
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 Py_XDECREF(errorHandler);
4809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 Py_XDECREF(v);
4811 return NULL;
4812}
4813
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004814/* Charmap encoding: the lookup table */
4815
4816struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 PyObject_HEAD
4818 unsigned char level1[32];
4819 int count2, count3;
4820 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004821};
4822
4823static PyObject*
4824encoding_map_size(PyObject *obj, PyObject* args)
4825{
4826 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004827 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004829}
4830
4831static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004832 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 PyDoc_STR("Return the size (in bytes) of this object") },
4834 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004835};
4836
4837static void
4838encoding_map_dealloc(PyObject* o)
4839{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004841}
4842
4843static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004844 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 "EncodingMap", /*tp_name*/
4846 sizeof(struct encoding_map), /*tp_basicsize*/
4847 0, /*tp_itemsize*/
4848 /* methods */
4849 encoding_map_dealloc, /*tp_dealloc*/
4850 0, /*tp_print*/
4851 0, /*tp_getattr*/
4852 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004853 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 0, /*tp_repr*/
4855 0, /*tp_as_number*/
4856 0, /*tp_as_sequence*/
4857 0, /*tp_as_mapping*/
4858 0, /*tp_hash*/
4859 0, /*tp_call*/
4860 0, /*tp_str*/
4861 0, /*tp_getattro*/
4862 0, /*tp_setattro*/
4863 0, /*tp_as_buffer*/
4864 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4865 0, /*tp_doc*/
4866 0, /*tp_traverse*/
4867 0, /*tp_clear*/
4868 0, /*tp_richcompare*/
4869 0, /*tp_weaklistoffset*/
4870 0, /*tp_iter*/
4871 0, /*tp_iternext*/
4872 encoding_map_methods, /*tp_methods*/
4873 0, /*tp_members*/
4874 0, /*tp_getset*/
4875 0, /*tp_base*/
4876 0, /*tp_dict*/
4877 0, /*tp_descr_get*/
4878 0, /*tp_descr_set*/
4879 0, /*tp_dictoffset*/
4880 0, /*tp_init*/
4881 0, /*tp_alloc*/
4882 0, /*tp_new*/
4883 0, /*tp_free*/
4884 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004885};
4886
4887PyObject*
4888PyUnicode_BuildEncodingMap(PyObject* string)
4889{
4890 Py_UNICODE *decode;
4891 PyObject *result;
4892 struct encoding_map *mresult;
4893 int i;
4894 int need_dict = 0;
4895 unsigned char level1[32];
4896 unsigned char level2[512];
4897 unsigned char *mlevel1, *mlevel2, *mlevel3;
4898 int count2 = 0, count3 = 0;
4899
4900 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4901 PyErr_BadArgument();
4902 return NULL;
4903 }
4904 decode = PyUnicode_AS_UNICODE(string);
4905 memset(level1, 0xFF, sizeof level1);
4906 memset(level2, 0xFF, sizeof level2);
4907
4908 /* If there isn't a one-to-one mapping of NULL to \0,
4909 or if there are non-BMP characters, we need to use
4910 a mapping dictionary. */
4911 if (decode[0] != 0)
4912 need_dict = 1;
4913 for (i = 1; i < 256; i++) {
4914 int l1, l2;
4915 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004916#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004917 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004918#endif
4919 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004920 need_dict = 1;
4921 break;
4922 }
4923 if (decode[i] == 0xFFFE)
4924 /* unmapped character */
4925 continue;
4926 l1 = decode[i] >> 11;
4927 l2 = decode[i] >> 7;
4928 if (level1[l1] == 0xFF)
4929 level1[l1] = count2++;
4930 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004931 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004932 }
4933
4934 if (count2 >= 0xFF || count3 >= 0xFF)
4935 need_dict = 1;
4936
4937 if (need_dict) {
4938 PyObject *result = PyDict_New();
4939 PyObject *key, *value;
4940 if (!result)
4941 return NULL;
4942 for (i = 0; i < 256; i++) {
4943 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004944 key = PyLong_FromLong(decode[i]);
4945 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004946 if (!key || !value)
4947 goto failed1;
4948 if (PyDict_SetItem(result, key, value) == -1)
4949 goto failed1;
4950 Py_DECREF(key);
4951 Py_DECREF(value);
4952 }
4953 return result;
4954 failed1:
4955 Py_XDECREF(key);
4956 Py_XDECREF(value);
4957 Py_DECREF(result);
4958 return NULL;
4959 }
4960
4961 /* Create a three-level trie */
4962 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4963 16*count2 + 128*count3 - 1);
4964 if (!result)
4965 return PyErr_NoMemory();
4966 PyObject_Init(result, &EncodingMapType);
4967 mresult = (struct encoding_map*)result;
4968 mresult->count2 = count2;
4969 mresult->count3 = count3;
4970 mlevel1 = mresult->level1;
4971 mlevel2 = mresult->level23;
4972 mlevel3 = mresult->level23 + 16*count2;
4973 memcpy(mlevel1, level1, 32);
4974 memset(mlevel2, 0xFF, 16*count2);
4975 memset(mlevel3, 0, 128*count3);
4976 count3 = 0;
4977 for (i = 1; i < 256; i++) {
4978 int o1, o2, o3, i2, i3;
4979 if (decode[i] == 0xFFFE)
4980 /* unmapped character */
4981 continue;
4982 o1 = decode[i]>>11;
4983 o2 = (decode[i]>>7) & 0xF;
4984 i2 = 16*mlevel1[o1] + o2;
4985 if (mlevel2[i2] == 0xFF)
4986 mlevel2[i2] = count3++;
4987 o3 = decode[i] & 0x7F;
4988 i3 = 128*mlevel2[i2] + o3;
4989 mlevel3[i3] = i;
4990 }
4991 return result;
4992}
4993
4994static int
4995encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4996{
4997 struct encoding_map *map = (struct encoding_map*)mapping;
4998 int l1 = c>>11;
4999 int l2 = (c>>7) & 0xF;
5000 int l3 = c & 0x7F;
5001 int i;
5002
5003#ifdef Py_UNICODE_WIDE
5004 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005006 }
5007#endif
5008 if (c == 0)
5009 return 0;
5010 /* level 1*/
5011 i = map->level1[l1];
5012 if (i == 0xFF) {
5013 return -1;
5014 }
5015 /* level 2*/
5016 i = map->level23[16*i+l2];
5017 if (i == 0xFF) {
5018 return -1;
5019 }
5020 /* level 3 */
5021 i = map->level23[16*map->count2 + 128*i + l3];
5022 if (i == 0) {
5023 return -1;
5024 }
5025 return i;
5026}
5027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028/* Lookup the character ch in the mapping. If the character
5029 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005030 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032{
Christian Heimes217cfd12007-12-02 14:31:20 +00005033 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 PyObject *x;
5035
5036 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 x = PyObject_GetItem(mapping, w);
5039 Py_DECREF(w);
5040 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5042 /* No mapping found means: mapping is undefined. */
5043 PyErr_Clear();
5044 x = Py_None;
5045 Py_INCREF(x);
5046 return x;
5047 } else
5048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005050 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005052 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 long value = PyLong_AS_LONG(x);
5054 if (value < 0 || value > 255) {
5055 PyErr_SetString(PyExc_TypeError,
5056 "character mapping must be in range(256)");
5057 Py_DECREF(x);
5058 return NULL;
5059 }
5060 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005062 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 /* wrong return value */
5066 PyErr_Format(PyExc_TypeError,
5067 "character mapping must return integer, bytes or None, not %.400s",
5068 x->ob_type->tp_name);
5069 Py_DECREF(x);
5070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
5072}
5073
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005074static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005075charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005076{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005077 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5078 /* exponentially overallocate to minimize reallocations */
5079 if (requiredsize < 2*outsize)
5080 requiredsize = 2*outsize;
5081 if (_PyBytes_Resize(outobj, requiredsize))
5082 return -1;
5083 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005084}
5085
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005090 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 space is available. Return a new reference to the object that
5092 was put in the output buffer, or Py_None, if the mapping was undefined
5093 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005094 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005096charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005099 PyObject *rep;
5100 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005101 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102
Christian Heimes90aa7642007-12-19 02:45:37 +00005103 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005104 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005106 if (res == -1)
5107 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 if (outsize<requiredsize)
5109 if (charmapencode_resize(outobj, outpos, requiredsize))
5110 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005111 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 outstart[(*outpos)++] = (char)res;
5113 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005114 }
5115
5116 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005119 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 Py_DECREF(rep);
5121 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005122 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 if (PyLong_Check(rep)) {
5124 Py_ssize_t requiredsize = *outpos+1;
5125 if (outsize<requiredsize)
5126 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5127 Py_DECREF(rep);
5128 return enc_EXCEPTION;
5129 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005130 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005132 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 else {
5134 const char *repchars = PyBytes_AS_STRING(rep);
5135 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5136 Py_ssize_t requiredsize = *outpos+repsize;
5137 if (outsize<requiredsize)
5138 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5139 Py_DECREF(rep);
5140 return enc_EXCEPTION;
5141 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005142 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 memcpy(outstart + *outpos, repchars, repsize);
5144 *outpos += repsize;
5145 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005147 Py_DECREF(rep);
5148 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149}
5150
5151/* handle an error in PyUnicode_EncodeCharmap
5152 Return 0 on success, -1 on error */
5153static
5154int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005155 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005157 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005158 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159{
5160 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t repsize;
5162 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 Py_UNICODE *uni2;
5164 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005165 Py_ssize_t collstartpos = *inpos;
5166 Py_ssize_t collendpos = *inpos+1;
5167 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 char *encoding = "charmap";
5169 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005170 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 /* find all unencodable characters */
5173 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005174 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005175 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 int res = encoding_map_lookup(p[collendpos], mapping);
5177 if (res != -1)
5178 break;
5179 ++collendpos;
5180 continue;
5181 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 rep = charmapencode_lookup(p[collendpos], mapping);
5184 if (rep==NULL)
5185 return -1;
5186 else if (rep!=Py_None) {
5187 Py_DECREF(rep);
5188 break;
5189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005190 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 }
5193 /* cache callback name lookup
5194 * (if not done yet, i.e. it's the first error) */
5195 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 if ((errors==NULL) || (!strcmp(errors, "strict")))
5197 *known_errorHandler = 1;
5198 else if (!strcmp(errors, "replace"))
5199 *known_errorHandler = 2;
5200 else if (!strcmp(errors, "ignore"))
5201 *known_errorHandler = 3;
5202 else if (!strcmp(errors, "xmlcharrefreplace"))
5203 *known_errorHandler = 4;
5204 else
5205 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 }
5207 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005208 case 1: /* strict */
5209 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5210 return -1;
5211 case 2: /* replace */
5212 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 x = charmapencode_output('?', mapping, res, respos);
5214 if (x==enc_EXCEPTION) {
5215 return -1;
5216 }
5217 else if (x==enc_FAILED) {
5218 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5219 return -1;
5220 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005221 }
5222 /* fall through */
5223 case 3: /* ignore */
5224 *inpos = collendpos;
5225 break;
5226 case 4: /* xmlcharrefreplace */
5227 /* generate replacement (temporarily (mis)uses p) */
5228 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 char buffer[2+29+1+1];
5230 char *cp;
5231 sprintf(buffer, "&#%d;", (int)p[collpos]);
5232 for (cp = buffer; *cp; ++cp) {
5233 x = charmapencode_output(*cp, mapping, res, respos);
5234 if (x==enc_EXCEPTION)
5235 return -1;
5236 else if (x==enc_FAILED) {
5237 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5238 return -1;
5239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005240 }
5241 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005242 *inpos = collendpos;
5243 break;
5244 default:
5245 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 encoding, reason, p, size, exceptionObject,
5247 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005248 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005250 if (PyBytes_Check(repunicode)) {
5251 /* Directly copy bytes result to output. */
5252 Py_ssize_t outsize = PyBytes_Size(*res);
5253 Py_ssize_t requiredsize;
5254 repsize = PyBytes_Size(repunicode);
5255 requiredsize = *respos + repsize;
5256 if (requiredsize > outsize)
5257 /* Make room for all additional bytes. */
5258 if (charmapencode_resize(res, respos, requiredsize)) {
5259 Py_DECREF(repunicode);
5260 return -1;
5261 }
5262 memcpy(PyBytes_AsString(*res) + *respos,
5263 PyBytes_AsString(repunicode), repsize);
5264 *respos += repsize;
5265 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005266 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005267 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 /* generate replacement */
5270 repsize = PyUnicode_GET_SIZE(repunicode);
5271 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 x = charmapencode_output(*uni2, mapping, res, respos);
5273 if (x==enc_EXCEPTION) {
5274 return -1;
5275 }
5276 else if (x==enc_FAILED) {
5277 Py_DECREF(repunicode);
5278 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5279 return -1;
5280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 }
5282 *inpos = newpos;
5283 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284 }
5285 return 0;
5286}
5287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 Py_ssize_t size,
5290 PyObject *mapping,
5291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293 /* output object */
5294 PyObject *res = NULL;
5295 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005297 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005299 PyObject *errorHandler = NULL;
5300 PyObject *exc = NULL;
5301 /* the following variable is used for caching string comparisons
5302 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5303 * 3=ignore, 4=xmlcharrefreplace */
5304 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
5306 /* Default to Latin-1 */
5307 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 /* allocate enough for a simple encoding without
5311 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005312 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 if (res == NULL)
5314 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005315 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 /* try to encode it */
5320 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5321 if (x==enc_EXCEPTION) /* error */
5322 goto onError;
5323 if (x==enc_FAILED) { /* unencodable character */
5324 if (charmap_encoding_error(p, size, &inpos, mapping,
5325 &exc,
5326 &known_errorHandler, &errorHandler, errors,
5327 &res, &respos)) {
5328 goto onError;
5329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 else
5332 /* done with this character => adjust input position */
5333 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005337 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005338 if (_PyBytes_Resize(&res, respos) < 0)
5339 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 Py_XDECREF(exc);
5342 Py_XDECREF(errorHandler);
5343 return res;
5344
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_XDECREF(res);
5347 Py_XDECREF(exc);
5348 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 return NULL;
5350}
5351
5352PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
5355 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 PyErr_BadArgument();
5357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
5359 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 PyUnicode_GET_SIZE(unicode),
5361 mapping,
5362 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363}
5364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365/* create or adjust a UnicodeTranslateError */
5366static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 const Py_UNICODE *unicode, Py_ssize_t size,
5368 Py_ssize_t startpos, Py_ssize_t endpos,
5369 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005372 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 }
5375 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5377 goto onError;
5378 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5379 goto onError;
5380 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5381 goto onError;
5382 return;
5383 onError:
5384 Py_DECREF(*exceptionObject);
5385 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
5387}
5388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389/* raises a UnicodeTranslateError */
5390static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 const Py_UNICODE *unicode, Py_ssize_t size,
5392 Py_ssize_t startpos, Py_ssize_t endpos,
5393 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394{
5395 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399}
5400
5401/* error handling callback helper:
5402 build arguments, call the callback and check the arguments,
5403 put the result into newpos and return the replacement string, which
5404 has to be freed by the caller */
5405static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 PyObject **errorHandler,
5407 const char *reason,
5408 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5409 Py_ssize_t startpos, Py_ssize_t endpos,
5410 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005412 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005414 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 PyObject *restuple;
5416 PyObject *resunicode;
5417
5418 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005420 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 }
5423
5424 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428
5429 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005434 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 Py_DECREF(restuple);
5436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 }
5438 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 &resunicode, &i_newpos)) {
5440 Py_DECREF(restuple);
5441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005443 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005445 else
5446 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005447 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5449 Py_DECREF(restuple);
5450 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 Py_INCREF(resunicode);
5453 Py_DECREF(restuple);
5454 return resunicode;
5455}
5456
5457/* Lookup the character ch in the mapping and put the result in result,
5458 which must be decrefed by the caller.
5459 Return 0 on success, -1 on error */
5460static
5461int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5462{
Christian Heimes217cfd12007-12-02 14:31:20 +00005463 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 PyObject *x;
5465
5466 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 x = PyObject_GetItem(mapping, w);
5469 Py_DECREF(w);
5470 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5472 /* No mapping found means: use 1:1 mapping. */
5473 PyErr_Clear();
5474 *result = NULL;
5475 return 0;
5476 } else
5477 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 }
5479 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 *result = x;
5481 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005483 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 long value = PyLong_AS_LONG(x);
5485 long max = PyUnicode_GetMax();
5486 if (value < 0 || value > max) {
5487 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005488 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 Py_DECREF(x);
5490 return -1;
5491 }
5492 *result = x;
5493 return 0;
5494 }
5495 else if (PyUnicode_Check(x)) {
5496 *result = x;
5497 return 0;
5498 }
5499 else {
5500 /* wrong return value */
5501 PyErr_SetString(PyExc_TypeError,
5502 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005503 Py_DECREF(x);
5504 return -1;
5505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506}
5507/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 if not reallocate and adjust various state variables.
5509 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510static
Walter Dörwald4894c302003-10-24 14:25:28 +00005511int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005514 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005515 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 /* remember old output position */
5517 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5518 /* exponentially overallocate to minimize reallocations */
5519 if (requiredsize < 2 * oldsize)
5520 requiredsize = 2 * oldsize;
5521 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5522 return -1;
5523 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 }
5525 return 0;
5526}
5527/* lookup the character, put the result in the output string and adjust
5528 various state variables. Return a new reference to the object that
5529 was put in the output buffer in *result, or Py_None, if the mapping was
5530 undefined (in which case no character was written).
5531 The called must decref result.
5532 Return 0 on success, -1 on error. */
5533static
Walter Dörwald4894c302003-10-24 14:25:28 +00005534int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5536 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537{
Walter Dörwald4894c302003-10-24 14:25:28 +00005538 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005540 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 /* not found => default to 1:1 mapping */
5542 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 }
5544 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005546 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 /* no overflow check, because we know that the space is enough */
5548 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 }
5550 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5552 if (repsize==1) {
5553 /* no overflow check, because we know that the space is enough */
5554 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5555 }
5556 else if (repsize!=0) {
5557 /* more than one character */
5558 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5559 (insize - (curinp-startinp)) +
5560 repsize - 1;
5561 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5562 return -1;
5563 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5564 *outp += repsize;
5565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 }
5567 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 return 0;
5570}
5571
5572PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 Py_ssize_t size,
5574 PyObject *mapping,
5575 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 /* output object */
5578 PyObject *res = NULL;
5579 /* pointers to the beginning and end+1 of input */
5580 const Py_UNICODE *startp = p;
5581 const Py_UNICODE *endp = p + size;
5582 /* pointer into the output */
5583 Py_UNICODE *str;
5584 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005585 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 char *reason = "character maps to <undefined>";
5587 PyObject *errorHandler = NULL;
5588 PyObject *exc = NULL;
5589 /* the following variable is used for caching string comparisons
5590 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5591 * 3=ignore, 4=xmlcharrefreplace */
5592 int known_errorHandler = -1;
5593
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 PyErr_BadArgument();
5596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598
5599 /* allocate enough for a simple 1:1 translation without
5600 replacements, if we need more, we'll resize */
5601 res = PyUnicode_FromUnicode(NULL, size);
5602 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 /* try to encode it */
5610 PyObject *x = NULL;
5611 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5612 Py_XDECREF(x);
5613 goto onError;
5614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005615 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 if (x!=Py_None) /* it worked => adjust input pointer */
5617 ++p;
5618 else { /* untranslatable character */
5619 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5620 Py_ssize_t repsize;
5621 Py_ssize_t newpos;
5622 Py_UNICODE *uni2;
5623 /* startpos for collecting untranslatable chars */
5624 const Py_UNICODE *collstart = p;
5625 const Py_UNICODE *collend = p+1;
5626 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 /* find all untranslatable characters */
5629 while (collend < endp) {
5630 if (charmaptranslate_lookup(*collend, mapping, &x))
5631 goto onError;
5632 Py_XDECREF(x);
5633 if (x!=Py_None)
5634 break;
5635 ++collend;
5636 }
5637 /* cache callback name lookup
5638 * (if not done yet, i.e. it's the first error) */
5639 if (known_errorHandler==-1) {
5640 if ((errors==NULL) || (!strcmp(errors, "strict")))
5641 known_errorHandler = 1;
5642 else if (!strcmp(errors, "replace"))
5643 known_errorHandler = 2;
5644 else if (!strcmp(errors, "ignore"))
5645 known_errorHandler = 3;
5646 else if (!strcmp(errors, "xmlcharrefreplace"))
5647 known_errorHandler = 4;
5648 else
5649 known_errorHandler = 0;
5650 }
5651 switch (known_errorHandler) {
5652 case 1: /* strict */
5653 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005654 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 case 2: /* replace */
5656 /* No need to check for space, this is a 1:1 replacement */
5657 for (coll = collstart; coll<collend; ++coll)
5658 *str++ = '?';
5659 /* fall through */
5660 case 3: /* ignore */
5661 p = collend;
5662 break;
5663 case 4: /* xmlcharrefreplace */
5664 /* generate replacement (temporarily (mis)uses p) */
5665 for (p = collstart; p < collend; ++p) {
5666 char buffer[2+29+1+1];
5667 char *cp;
5668 sprintf(buffer, "&#%d;", (int)*p);
5669 if (charmaptranslate_makespace(&res, &str,
5670 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5671 goto onError;
5672 for (cp = buffer; *cp; ++cp)
5673 *str++ = *cp;
5674 }
5675 p = collend;
5676 break;
5677 default:
5678 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5679 reason, startp, size, &exc,
5680 collstart-startp, collend-startp, &newpos);
5681 if (repunicode == NULL)
5682 goto onError;
5683 /* generate replacement */
5684 repsize = PyUnicode_GET_SIZE(repunicode);
5685 if (charmaptranslate_makespace(&res, &str,
5686 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5687 Py_DECREF(repunicode);
5688 goto onError;
5689 }
5690 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5691 *str++ = *uni2;
5692 p = startp + newpos;
5693 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 }
5696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 /* Resize if we allocated to much */
5698 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005699 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 if (PyUnicode_Resize(&res, respos) < 0)
5701 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 }
5703 Py_XDECREF(exc);
5704 Py_XDECREF(errorHandler);
5705 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 Py_XDECREF(res);
5709 Py_XDECREF(exc);
5710 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 return NULL;
5712}
5713
5714PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 PyObject *mapping,
5716 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
5718 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005719
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 str = PyUnicode_FromObject(str);
5721 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 PyUnicode_GET_SIZE(str),
5725 mapping,
5726 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 Py_DECREF(str);
5728 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 Py_XDECREF(str);
5732 return NULL;
5733}
Tim Petersced69f82003-09-16 20:30:58 +00005734
Guido van Rossum9e896b32000-04-05 20:11:21 +00005735/* --- Decimal Encoder ---------------------------------------------------- */
5736
5737int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 Py_ssize_t length,
5739 char *output,
5740 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005741{
5742 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 PyObject *errorHandler = NULL;
5744 PyObject *exc = NULL;
5745 const char *encoding = "decimal";
5746 const char *reason = "invalid decimal Unicode string";
5747 /* the following variable is used for caching string comparisons
5748 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5749 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005750
5751 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 PyErr_BadArgument();
5753 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005754 }
5755
5756 p = s;
5757 end = s + length;
5758 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 register Py_UNICODE ch = *p;
5760 int decimal;
5761 PyObject *repunicode;
5762 Py_ssize_t repsize;
5763 Py_ssize_t newpos;
5764 Py_UNICODE *uni2;
5765 Py_UNICODE *collstart;
5766 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005769 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 ++p;
5771 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005772 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 decimal = Py_UNICODE_TODECIMAL(ch);
5774 if (decimal >= 0) {
5775 *output++ = '0' + decimal;
5776 ++p;
5777 continue;
5778 }
5779 if (0 < ch && ch < 256) {
5780 *output++ = (char)ch;
5781 ++p;
5782 continue;
5783 }
5784 /* All other characters are considered unencodable */
5785 collstart = p;
5786 collend = p+1;
5787 while (collend < end) {
5788 if ((0 < *collend && *collend < 256) ||
5789 !Py_UNICODE_ISSPACE(*collend) ||
5790 Py_UNICODE_TODECIMAL(*collend))
5791 break;
5792 }
5793 /* cache callback name lookup
5794 * (if not done yet, i.e. it's the first error) */
5795 if (known_errorHandler==-1) {
5796 if ((errors==NULL) || (!strcmp(errors, "strict")))
5797 known_errorHandler = 1;
5798 else if (!strcmp(errors, "replace"))
5799 known_errorHandler = 2;
5800 else if (!strcmp(errors, "ignore"))
5801 known_errorHandler = 3;
5802 else if (!strcmp(errors, "xmlcharrefreplace"))
5803 known_errorHandler = 4;
5804 else
5805 known_errorHandler = 0;
5806 }
5807 switch (known_errorHandler) {
5808 case 1: /* strict */
5809 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5810 goto onError;
5811 case 2: /* replace */
5812 for (p = collstart; p < collend; ++p)
5813 *output++ = '?';
5814 /* fall through */
5815 case 3: /* ignore */
5816 p = collend;
5817 break;
5818 case 4: /* xmlcharrefreplace */
5819 /* generate replacement (temporarily (mis)uses p) */
5820 for (p = collstart; p < collend; ++p)
5821 output += sprintf(output, "&#%d;", (int)*p);
5822 p = collend;
5823 break;
5824 default:
5825 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5826 encoding, reason, s, length, &exc,
5827 collstart-s, collend-s, &newpos);
5828 if (repunicode == NULL)
5829 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005830 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005831 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005832 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5833 Py_DECREF(repunicode);
5834 goto onError;
5835 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 /* generate replacement */
5837 repsize = PyUnicode_GET_SIZE(repunicode);
5838 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5839 Py_UNICODE ch = *uni2;
5840 if (Py_UNICODE_ISSPACE(ch))
5841 *output++ = ' ';
5842 else {
5843 decimal = Py_UNICODE_TODECIMAL(ch);
5844 if (decimal >= 0)
5845 *output++ = '0' + decimal;
5846 else if (0 < ch && ch < 256)
5847 *output++ = (char)ch;
5848 else {
5849 Py_DECREF(repunicode);
5850 raise_encode_exception(&exc, encoding,
5851 s, length, collstart-s, collend-s, reason);
5852 goto onError;
5853 }
5854 }
5855 }
5856 p = s + newpos;
5857 Py_DECREF(repunicode);
5858 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005859 }
5860 /* 0-terminate the output string */
5861 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(exc);
5863 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005864 return 0;
5865
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_XDECREF(exc);
5868 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005869 return -1;
5870}
5871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872/* --- Helpers ------------------------------------------------------------ */
5873
Eric Smith8c663262007-08-25 02:26:07 +00005874#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005875#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005877/* Include _ParseTupleFinds from find.h */
5878#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879#include "stringlib/find.h"
5880#include "stringlib/partition.h"
5881
Eric Smith5807c412008-05-11 21:00:57 +00005882#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005883#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005884#include "stringlib/localeutil.h"
5885
/* Helper macro that normalizes slice bounds in place.  It reads and
   writes variables named `start` and `end` in the enclosing scope and
   uses (obj)->length as the sequence length: negative values are taken
   relative to the end and then clamped at 0, and `end` is capped at the
   length.  Note that it expands to several statements and evaluates
   `obj` more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5898
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005900 PyObject *substr,
5901 Py_ssize_t start,
5902 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005904 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 PyUnicodeObject* str_obj;
5906 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005907
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5909 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005911 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5912 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 Py_DECREF(str_obj);
5914 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
Tim Petersced69f82003-09-16 20:30:58 +00005916
Thomas Wouters477c8d52006-05-27 19:21:47 +00005917 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005918
Thomas Wouters477c8d52006-05-27 19:21:47 +00005919 result = stringlib_count(
5920 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5921 );
5922
5923 Py_DECREF(sub_obj);
5924 Py_DECREF(str_obj);
5925
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 return result;
5927}
5928
Martin v. Löwis18e16552006-02-15 17:27:45 +00005929Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005930 PyObject *sub,
5931 Py_ssize_t start,
5932 Py_ssize_t end,
5933 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005935 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005938 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005940 sub = PyUnicode_FromObject(sub);
5941 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 Py_DECREF(str);
5943 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
Tim Petersced69f82003-09-16 20:30:58 +00005945
Thomas Wouters477c8d52006-05-27 19:21:47 +00005946 if (direction > 0)
5947 result = stringlib_find_slice(
5948 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5949 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5950 start, end
5951 );
5952 else
5953 result = stringlib_rfind_slice(
5954 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5955 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5956 start, end
5957 );
5958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960 Py_DECREF(sub);
5961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 return result;
5963}
5964
Tim Petersced69f82003-09-16 20:30:58 +00005965static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 PyUnicodeObject *substring,
5968 Py_ssize_t start,
5969 Py_ssize_t end,
5970 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 if (substring->length == 0)
5973 return 1;
5974
Thomas Wouters477c8d52006-05-27 19:21:47 +00005975 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977 end -= substring->length;
5978 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
5981 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 if (Py_UNICODE_MATCH(self, end, substring))
5983 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 } else {
5985 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
5988
5989 return 0;
5990}
5991
Martin v. Löwis18e16552006-02-15 17:27:45 +00005992Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 PyObject *substr,
5994 Py_ssize_t start,
5995 Py_ssize_t end,
5996 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005998 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 str = PyUnicode_FromObject(str);
6001 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 substr = PyUnicode_FromObject(substr);
6004 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_DECREF(str);
6006 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 }
Tim Petersced69f82003-09-16 20:30:58 +00006008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 (PyUnicodeObject *)substr,
6011 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 Py_DECREF(str);
6013 Py_DECREF(substr);
6014 return result;
6015}
6016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017/* Apply fixfct filter to the Unicode object self and return a
6018 reference to the modified object */
6019
Tim Petersced69f82003-09-16 20:30:58 +00006020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
6024
6025 PyUnicodeObject *u;
6026
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006027 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006030
6031 Py_UNICODE_COPY(u->str, self->str, self->length);
6032
Tim Peters7a29bd52001-09-12 03:03:31 +00006033 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* fixfct should return TRUE if it modified the buffer. If
6035 FALSE, return a reference to the original buffer instead
6036 (to save space, not time) */
6037 Py_INCREF(self);
6038 Py_DECREF(u);
6039 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 }
6041 return (PyObject*) u;
6042}
6043
Tim Petersced69f82003-09-16 20:30:58 +00006044static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045int fixupper(PyUnicodeObject *self)
6046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 Py_UNICODE *s = self->str;
6049 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006053
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 ch = Py_UNICODE_TOUPPER(*s);
6055 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 *s = ch;
6058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 s++;
6060 }
6061
6062 return status;
6063}
6064
Tim Petersced69f82003-09-16 20:30:58 +00006065static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066int fixlower(PyUnicodeObject *self)
6067{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006068 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 Py_UNICODE *s = self->str;
6070 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 ch = Py_UNICODE_TOLOWER(*s);
6076 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 *s = ch;
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 s++;
6081 }
6082
6083 return status;
6084}
6085
Tim Petersced69f82003-09-16 20:30:58 +00006086static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087int fixswapcase(PyUnicodeObject *self)
6088{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006089 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 Py_UNICODE *s = self->str;
6091 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006092
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 while (len-- > 0) {
6094 if (Py_UNICODE_ISUPPER(*s)) {
6095 *s = Py_UNICODE_TOLOWER(*s);
6096 status = 1;
6097 } else if (Py_UNICODE_ISLOWER(*s)) {
6098 *s = Py_UNICODE_TOUPPER(*s);
6099 status = 1;
6100 }
6101 s++;
6102 }
6103
6104 return status;
6105}
6106
Tim Petersced69f82003-09-16 20:30:58 +00006107static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108int fixcapitalize(PyUnicodeObject *self)
6109{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006111 Py_UNICODE *s = self->str;
6112 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006113
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006114 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006116 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 *s = Py_UNICODE_TOUPPER(*s);
6118 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006120 s++;
6121 while (--len > 0) {
6122 if (Py_UNICODE_ISUPPER(*s)) {
6123 *s = Py_UNICODE_TOLOWER(*s);
6124 status = 1;
6125 }
6126 s++;
6127 }
6128 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129}
6130
6131static
6132int fixtitle(PyUnicodeObject *self)
6133{
6134 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6135 register Py_UNICODE *e;
6136 int previous_is_cased;
6137
6138 /* Shortcut for single character strings */
6139 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6141 if (*p != ch) {
6142 *p = ch;
6143 return 1;
6144 }
6145 else
6146 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
Tim Petersced69f82003-09-16 20:30:58 +00006148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 e = p + PyUnicode_GET_SIZE(self);
6150 previous_is_cased = 0;
6151 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006153
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 if (previous_is_cased)
6155 *p = Py_UNICODE_TOLOWER(ch);
6156 else
6157 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006158
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 if (Py_UNICODE_ISLOWER(ch) ||
6160 Py_UNICODE_ISUPPER(ch) ||
6161 Py_UNICODE_ISTITLE(ch))
6162 previous_is_cased = 1;
6163 else
6164 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 }
6166 return 1;
6167}
6168
Tim Peters8ce9f162004-08-27 01:49:32 +00006169PyObject *
6170PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Skip Montanaro6543b452004-09-16 03:28:13 +00006172 const Py_UNICODE blank = ' ';
6173 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006174 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006175 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006176 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6177 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006178 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6179 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006180 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006181 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
Tim Peters05eba1f2004-08-27 21:32:02 +00006183 fseq = PySequence_Fast(seq, "");
6184 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006185 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006186 }
6187
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006188 /* NOTE: the following code can't call back into Python code,
6189 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006190 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006191
Tim Peters05eba1f2004-08-27 21:32:02 +00006192 seqlen = PySequence_Fast_GET_SIZE(fseq);
6193 /* If empty sequence, return u"". */
6194 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006195 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6196 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006197 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006198 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006199 /* If singleton sequence with an exact Unicode, return that. */
6200 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 item = items[0];
6202 if (PyUnicode_CheckExact(item)) {
6203 Py_INCREF(item);
6204 res = (PyUnicodeObject *)item;
6205 goto Done;
6206 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006207 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006208 else {
6209 /* Set up sep and seplen */
6210 if (separator == NULL) {
6211 sep = &blank;
6212 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006213 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006214 else {
6215 if (!PyUnicode_Check(separator)) {
6216 PyErr_Format(PyExc_TypeError,
6217 "separator: expected str instance,"
6218 " %.80s found",
6219 Py_TYPE(separator)->tp_name);
6220 goto onError;
6221 }
6222 sep = PyUnicode_AS_UNICODE(separator);
6223 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006224 }
6225 }
6226
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006227 /* There are at least two things to join, or else we have a subclass
6228 * of str in the sequence.
6229 * Do a pre-pass to figure out the total amount of space we'll
6230 * need (sz), and see whether all argument are strings.
6231 */
6232 sz = 0;
6233 for (i = 0; i < seqlen; i++) {
6234 const Py_ssize_t old_sz = sz;
6235 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 if (!PyUnicode_Check(item)) {
6237 PyErr_Format(PyExc_TypeError,
6238 "sequence item %zd: expected str instance,"
6239 " %.80s found",
6240 i, Py_TYPE(item)->tp_name);
6241 goto onError;
6242 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006243 sz += PyUnicode_GET_SIZE(item);
6244 if (i != 0)
6245 sz += seplen;
6246 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6247 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006249 goto onError;
6250 }
6251 }
Tim Petersced69f82003-09-16 20:30:58 +00006252
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006253 res = _PyUnicode_New(sz);
6254 if (res == NULL)
6255 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006256
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006257 /* Catenate everything. */
6258 res_p = PyUnicode_AS_UNICODE(res);
6259 for (i = 0; i < seqlen; ++i) {
6260 Py_ssize_t itemlen;
6261 item = items[i];
6262 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 /* Copy item, and maybe the separator. */
6264 if (i) {
6265 Py_UNICODE_COPY(res_p, sep, seplen);
6266 res_p += seplen;
6267 }
6268 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6269 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006270 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006271
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006273 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return (PyObject *)res;
6275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006277 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006278 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 return NULL;
6280}
6281
Tim Petersced69f82003-09-16 20:30:58 +00006282static
6283PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 Py_ssize_t left,
6285 Py_ssize_t right,
6286 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287{
6288 PyUnicodeObject *u;
6289
6290 if (left < 0)
6291 left = 0;
6292 if (right < 0)
6293 right = 0;
6294
Tim Peters7a29bd52001-09-12 03:03:31 +00006295 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 Py_INCREF(self);
6297 return self;
6298 }
6299
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006300 if (left > PY_SSIZE_T_MAX - self->length ||
6301 right > PY_SSIZE_T_MAX - (left + self->length)) {
6302 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6303 return NULL;
6304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 u = _PyUnicode_New(left + self->length + right);
6306 if (u) {
6307 if (left)
6308 Py_UNICODE_FILL(u->str, fill, left);
6309 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6310 if (right)
6311 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6312 }
6313
6314 return u;
6315}
6316
/* Append data[left:right] to `list` as a new unicode object.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; jumps to onError
   if the slice cannot be created or appended.  `left` is evaluated
   twice.  Expands to several statements, so use it only as a statement
   inside braces. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328static
6329PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 PyObject *list,
6331 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 register Py_ssize_t i;
6334 register Py_ssize_t j;
6335 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006337 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
6339 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006341 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006343 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6345 i++;
6346 if (j < i) {
6347 if (maxcount-- <= 0)
6348 break;
6349 SPLIT_APPEND(buf, j, i);
6350 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6351 i++;
6352 j = i;
6353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 }
6355 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
6358 return list;
6359
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 Py_DECREF(list);
6362 return NULL;
6363}
6364
6365PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006368 register Py_ssize_t i;
6369 register Py_ssize_t j;
6370 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 PyObject *list;
6372 PyObject *str;
6373 Py_UNICODE *data;
6374
6375 string = PyUnicode_FromObject(string);
6376 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 data = PyUnicode_AS_UNICODE(string);
6379 len = PyUnicode_GET_SIZE(string);
6380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 list = PyList_New(0);
6382 if (!list)
6383 goto onError;
6384
6385 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 /* Find a line and append it */
6389 while (i < len && !BLOOM_LINEBREAK(data[i]))
6390 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006393 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 if (i < len) {
6395 if (data[i] == '\r' && i + 1 < len &&
6396 data[i+1] == '\n')
6397 i += 2;
6398 else
6399 i++;
6400 if (keepends)
6401 eol = i;
6402 }
6403 SPLIT_APPEND(data, j, eol);
6404 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 }
6406 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
6409
6410 Py_DECREF(string);
6411 return list;
6412
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006414 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 Py_DECREF(string);
6416 return NULL;
6417}
6418
Tim Petersced69f82003-09-16 20:30:58 +00006419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 PyObject *list,
6422 Py_UNICODE ch,
6423 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006425 register Py_ssize_t i;
6426 register Py_ssize_t j;
6427 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006429 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
6431 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 if (buf[i] == ch) {
6433 if (maxcount-- <= 0)
6434 break;
6435 SPLIT_APPEND(buf, j, i);
6436 i = j = i + 1;
6437 } else
6438 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
6440 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
6443 return list;
6444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 Py_DECREF(list);
6447 return NULL;
6448}
6449
Tim Petersced69f82003-09-16 20:30:58 +00006450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 PyObject *list,
6453 PyUnicodeObject *substring,
6454 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 register Py_ssize_t i;
6457 register Py_ssize_t j;
6458 Py_ssize_t len = self->length;
6459 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 PyObject *str;
6461
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006462 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 if (Py_UNICODE_MATCH(self, i, substring)) {
6464 if (maxcount-- <= 0)
6465 break;
6466 SPLIT_APPEND(self->str, j, i);
6467 i = j = i + sublen;
6468 } else
6469 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 }
6471 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 }
6474 return list;
6475
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 Py_DECREF(list);
6478 return NULL;
6479}
6480
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006481static
6482PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 PyObject *list,
6484 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006485{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006486 register Py_ssize_t i;
6487 register Py_ssize_t j;
6488 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006489 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006490 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006491
6492 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006496 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6498 i--;
6499 if (j > i) {
6500 if (maxcount-- <= 0)
6501 break;
6502 SPLIT_APPEND(buf, i + 1, j + 1);
6503 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6504 i--;
6505 j = i;
6506 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006507 }
6508 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 if (PyList_Reverse(list) < 0)
6512 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006513 return list;
6514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006516 Py_DECREF(list);
6517 return NULL;
6518}
6519
Benjamin Peterson14339b62009-01-31 16:36:08 +00006520static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006521PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 PyObject *list,
6523 Py_UNICODE ch,
6524 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526 register Py_ssize_t i;
6527 register Py_ssize_t j;
6528 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006529 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006530 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006531
6532 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 if (buf[i] == ch) {
6534 if (maxcount-- <= 0)
6535 break;
6536 SPLIT_APPEND(buf, i + 1, j + 1);
6537 j = i = i - 1;
6538 } else
6539 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006540 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006541 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006544 if (PyList_Reverse(list) < 0)
6545 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006546 return list;
6547
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006549 Py_DECREF(list);
6550 return NULL;
6551}
6552
Benjamin Peterson14339b62009-01-31 16:36:08 +00006553static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006554PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 PyObject *list,
6556 PyUnicodeObject *substring,
6557 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006558{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006559 register Py_ssize_t i;
6560 register Py_ssize_t j;
6561 Py_ssize_t len = self->length;
6562 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006563 PyObject *str;
6564
6565 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 if (Py_UNICODE_MATCH(self, i, substring)) {
6567 if (maxcount-- <= 0)
6568 break;
6569 SPLIT_APPEND(self->str, i + sublen, j);
6570 j = i;
6571 i -= sublen;
6572 } else
6573 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006574 }
6575 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006577 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 if (PyList_Reverse(list) < 0)
6579 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006580 return list;
6581
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006583 Py_DECREF(list);
6584 return NULL;
6585}
6586
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587#undef SPLIT_APPEND
6588
6589static
6590PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 PyUnicodeObject *substring,
6592 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593{
6594 PyObject *list;
6595
6596 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006597 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
6599 list = PyList_New(0);
6600 if (!list)
6601 return NULL;
6602
6603 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
6606 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
6609 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(list);
6611 PyErr_SetString(PyExc_ValueError, "empty separator");
6612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
6614 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616}
6617
Tim Petersced69f82003-09-16 20:30:58 +00006618static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006619PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 PyUnicodeObject *substring,
6621 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006622{
6623 PyObject *list;
6624
6625 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006626 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006627
6628 list = PyList_New(0);
6629 if (!list)
6630 return NULL;
6631
6632 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006634
6635 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006637
6638 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 Py_DECREF(list);
6640 PyErr_SetString(PyExc_ValueError, "empty separator");
6641 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006642 }
6643 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006645}
6646
6647static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 PyUnicodeObject *str1,
6650 PyUnicodeObject *str2,
6651 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
6653 PyUnicodeObject *u;
6654
6655 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Thomas Wouters477c8d52006-05-27 19:21:47 +00006658 if (str1->length == str2->length) {
6659 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006660 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006661 if (str1->length == 1) {
6662 /* replace characters */
6663 Py_UNICODE u1, u2;
6664 if (!findchar(self->str, self->length, str1->str[0]))
6665 goto nothing;
6666 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6667 if (!u)
6668 return NULL;
6669 Py_UNICODE_COPY(u->str, self->str, self->length);
6670 u1 = str1->str[0];
6671 u2 = str2->str[0];
6672 for (i = 0; i < u->length; i++)
6673 if (u->str[i] == u1) {
6674 if (--maxcount < 0)
6675 break;
6676 u->str[i] = u2;
6677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006679 i = fastsearch(
6680 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 if (i < 0)
6683 goto nothing;
6684 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6685 if (!u)
6686 return NULL;
6687 Py_UNICODE_COPY(u->str, self->str, self->length);
6688 while (i <= self->length - str1->length)
6689 if (Py_UNICODE_MATCH(self, i, str1)) {
6690 if (--maxcount < 0)
6691 break;
6692 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6693 i += str1->length;
6694 } else
6695 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006698
6699 Py_ssize_t n, i, j, e;
6700 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 Py_UNICODE *p;
6702
6703 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006704 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 if (n > maxcount)
6706 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006707 if (n == 0)
6708 goto nothing;
6709 /* new_size = self->length + n * (str2->length - str1->length)); */
6710 delta = (str2->length - str1->length);
6711 if (delta == 0) {
6712 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006714 product = n * (str2->length - str1->length);
6715 if ((product / (str2->length - str1->length)) != n) {
6716 PyErr_SetString(PyExc_OverflowError,
6717 "replace string is too long");
6718 return NULL;
6719 }
6720 new_size = self->length + product;
6721 if (new_size < 0) {
6722 PyErr_SetString(PyExc_OverflowError,
6723 "replace string is too long");
6724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 }
6726 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006727 u = _PyUnicode_New(new_size);
6728 if (!u)
6729 return NULL;
6730 i = 0;
6731 p = u->str;
6732 e = self->length - str1->length;
6733 if (str1->length > 0) {
6734 while (n-- > 0) {
6735 /* look for next match */
6736 j = i;
6737 while (j <= e) {
6738 if (Py_UNICODE_MATCH(self, j, str1))
6739 break;
6740 j++;
6741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006743 if (j > e)
6744 break;
6745 /* copy unchanged part [i:j] */
6746 Py_UNICODE_COPY(p, self->str+i, j-i);
6747 p += j - i;
6748 }
6749 /* copy substitution string */
6750 if (str2->length > 0) {
6751 Py_UNICODE_COPY(p, str2->str, str2->length);
6752 p += str2->length;
6753 }
6754 i = j + str1->length;
6755 }
6756 if (i < self->length)
6757 /* copy tail [i:] */
6758 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6759 } else {
6760 /* interleave */
6761 while (n > 0) {
6762 Py_UNICODE_COPY(p, str2->str, str2->length);
6763 p += str2->length;
6764 if (--n <= 0)
6765 break;
6766 *p++ = self->str[i++];
6767 }
6768 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006772
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006774 /* nothing to replace; return original string (when possible) */
6775 if (PyUnicode_CheckExact(self)) {
6776 Py_INCREF(self);
6777 return (PyObject *) self;
6778 }
6779 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
6782/* --- Unicode Object Methods --------------------------------------------- */
6783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786\n\
6787Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
6790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006791unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 return fixup(self, fixtitle);
6794}
6795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006796PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798\n\
6799Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
6802static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006803unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 return fixup(self, fixcapitalize);
6806}
6807
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, capitalize
   each word in place inside the list, then join the words again.
   Returns a new reference, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;
        /* PyList_SET_ITEM does not release the previous entry */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6845
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006846/* Argument converter. Coerces to a single unicode character */
6847
6848static int
6849convert_uc(PyObject *obj, void *addr)
6850{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006851 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6852 PyObject *uniobj;
6853 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006854
Benjamin Peterson14339b62009-01-31 16:36:08 +00006855 uniobj = PyUnicode_FromObject(obj);
6856 if (uniobj == NULL) {
6857 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006859 return 0;
6860 }
6861 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6862 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006864 Py_DECREF(uniobj);
6865 return 0;
6866 }
6867 unistr = PyUnicode_AS_UNICODE(uniobj);
6868 *fillcharloc = unistr[0];
6869 Py_DECREF(uniobj);
6870 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006871}
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006876Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006877done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
6879static PyObject *
6880unicode_center(PyUnicodeObject *self, PyObject *args)
6881{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006882 Py_ssize_t marg, left;
6883 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006884 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
Thomas Woutersde017742006-02-16 19:34:37 +00006886 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 return NULL;
6888
Tim Peters7a29bd52001-09-12 03:03:31 +00006889 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 Py_INCREF(self);
6891 return (PyObject*) self;
6892 }
6893
6894 marg = width - self->length;
6895 left = marg / 2 + (marg & width & 1);
6896
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006897 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Marc-André Lemburge5034372000-08-08 08:04:29 +00006900#if 0
6901
6902/* This code should go into some future Unicode collation support
6903 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006904 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006905
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006906/* speedy UTF-16 code point order comparison */
6907/* gleaned from: */
6908/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6909
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006910static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006911{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006912 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006913 0, 0, 0, 0, 0, 0, 0, 0,
6914 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006915 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006916};
6917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918static int
6919unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6920{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006922
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 Py_UNICODE *s1 = str1->str;
6924 Py_UNICODE *s2 = str2->str;
6925
6926 len1 = str1->length;
6927 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006930 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006931
6932 c1 = *s1++;
6933 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006934
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 if (c1 > (1<<11) * 26)
6936 c1 += utf16Fixup[c1>>11];
6937 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006938 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006939 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006940
6941 if (c1 != c2)
6942 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006943
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006944 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
6946
6947 return (len1 < len2) ? -1 : (len1 != len2);
6948}
6949
Marc-André Lemburge5034372000-08-08 08:04:29 +00006950#else
6951
6952static int
6953unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006955 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006956
6957 Py_UNICODE *s1 = str1->str;
6958 Py_UNICODE *s2 = str2->str;
6959
6960 len1 = str1->length;
6961 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006962
Marc-André Lemburge5034372000-08-08 08:04:29 +00006963 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006964 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006965
Fredrik Lundh45714e92001-06-26 16:39:36 +00006966 c1 = *s1++;
6967 c2 = *s2++;
6968
6969 if (c1 != c2)
6970 return (c1 < c2) ? -1 : 1;
6971
Marc-André Lemburge5034372000-08-08 08:04:29 +00006972 len1--; len2--;
6973 }
6974
6975 return (len1 < len2) ? -1 : (len1 != len2);
6976}
6977
6978#endif
6979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006983 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6984 return unicode_compare((PyUnicodeObject *)left,
6985 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006986 PyErr_Format(PyExc_TypeError,
6987 "Can't compare %.100s and %.100s",
6988 left->ob_type->tp_name,
6989 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 return -1;
6991}
6992
Martin v. Löwis5b222132007-06-10 09:51:05 +00006993int
6994PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6995{
6996 int i;
6997 Py_UNICODE *id;
6998 assert(PyUnicode_Check(uni));
6999 id = PyUnicode_AS_UNICODE(uni);
7000 /* Compare Unicode string and source character set string */
7001 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 if (id[i] != str[i])
7003 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00007004 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007006 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007008 return 0;
7009}
7010
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007011
Benjamin Peterson29060642009-01-31 22:14:21 +00007012#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007013 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007014
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007015PyObject *PyUnicode_RichCompare(PyObject *left,
7016 PyObject *right,
7017 int op)
7018{
7019 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007020
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007021 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7022 PyObject *v;
7023 if (((PyUnicodeObject *) left)->length !=
7024 ((PyUnicodeObject *) right)->length) {
7025 if (op == Py_EQ) {
7026 Py_INCREF(Py_False);
7027 return Py_False;
7028 }
7029 if (op == Py_NE) {
7030 Py_INCREF(Py_True);
7031 return Py_True;
7032 }
7033 }
7034 if (left == right)
7035 result = 0;
7036 else
7037 result = unicode_compare((PyUnicodeObject *)left,
7038 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007039
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007040 /* Convert the return value to a Boolean */
7041 switch (op) {
7042 case Py_EQ:
7043 v = TEST_COND(result == 0);
7044 break;
7045 case Py_NE:
7046 v = TEST_COND(result != 0);
7047 break;
7048 case Py_LE:
7049 v = TEST_COND(result <= 0);
7050 break;
7051 case Py_GE:
7052 v = TEST_COND(result >= 0);
7053 break;
7054 case Py_LT:
7055 v = TEST_COND(result == -1);
7056 break;
7057 case Py_GT:
7058 v = TEST_COND(result == 1);
7059 break;
7060 default:
7061 PyErr_BadArgument();
7062 return NULL;
7063 }
7064 Py_INCREF(v);
7065 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007066 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007067
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007068 Py_INCREF(Py_NotImplemented);
7069 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007070}
7071
Guido van Rossum403d68b2000-03-13 15:55:09 +00007072int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007074{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007075 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007076 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007077
7078 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007079 sub = PyUnicode_FromObject(element);
7080 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 PyErr_Format(PyExc_TypeError,
7082 "'in <string>' requires string as left operand, not %s",
7083 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007084 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007085 }
7086
Thomas Wouters477c8d52006-05-27 19:21:47 +00007087 str = PyUnicode_FromObject(container);
7088 if (!str) {
7089 Py_DECREF(sub);
7090 return -1;
7091 }
7092
7093 result = stringlib_contains_obj(str, sub);
7094
7095 Py_DECREF(str);
7096 Py_DECREF(sub);
7097
Guido van Rossum403d68b2000-03-13 15:55:09 +00007098 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007099}
7100
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101/* Concat to string or Unicode object giving a new Unicode object. */
7102
7103PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105{
7106 PyUnicodeObject *u = NULL, *v = NULL, *w;
7107
7108 /* Coerce the two arguments */
7109 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7110 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7113 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115
7116 /* Shortcuts */
7117 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 Py_DECREF(v);
7119 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 }
7121 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 Py_DECREF(u);
7123 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 }
7125
7126 /* Concat the two Unicode strings */
7127 w = _PyUnicode_New(u->length + v->length);
7128 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 Py_UNICODE_COPY(w->str, u->str, u->length);
7131 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7132
7133 Py_DECREF(u);
7134 Py_DECREF(v);
7135 return (PyObject *)w;
7136
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 Py_XDECREF(u);
7139 Py_XDECREF(v);
7140 return NULL;
7141}
7142
Walter Dörwald1ab83302007-05-18 17:15:44 +00007143void
7144PyUnicode_Append(PyObject **pleft, PyObject *right)
7145{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007146 PyObject *new;
7147 if (*pleft == NULL)
7148 return;
7149 if (right == NULL || !PyUnicode_Check(*pleft)) {
7150 Py_DECREF(*pleft);
7151 *pleft = NULL;
7152 return;
7153 }
7154 new = PyUnicode_Concat(*pleft, right);
7155 Py_DECREF(*pleft);
7156 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007157}
7158
7159void
7160PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007162 PyUnicode_Append(pleft, right);
7163 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007164}
7165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007166PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007169Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007170string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007171interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172
7173static PyObject *
7174unicode_count(PyUnicodeObject *self, PyObject *args)
7175{
7176 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007177 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007178 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 PyObject *result;
7180
Guido van Rossumb8872e62000-05-09 14:14:27 +00007181 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 return NULL;
7184
7185 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007186 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007189
Thomas Wouters477c8d52006-05-27 19:21:47 +00007190 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
Christian Heimes217cfd12007-12-02 14:31:20 +00007192 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007193 stringlib_count(self->str + start, end - start,
7194 substring->str, substring->length)
7195 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007198
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 return result;
7200}
7201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007202PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007205Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007206to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007207handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7209'xmlcharrefreplace' as well as any other name registered with\n\
7210codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007213unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007215 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 char *encoding = NULL;
7217 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007218 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007219
Benjamin Peterson308d6372009-09-18 21:42:35 +00007220 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7221 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007223 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007224 if (v == NULL)
7225 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007226 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007227 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007228 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007229 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007230 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007231 Py_DECREF(v);
7232 return NULL;
7233 }
7234 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007237 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007238}
7239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007240PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242\n\
7243Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
7246static PyObject*
7247unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7248{
7249 Py_UNICODE *e;
7250 Py_UNICODE *p;
7251 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007252 Py_UNICODE *qe;
7253 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 PyUnicodeObject *u;
7255 int tabsize = 8;
7256
7257 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
Thomas Wouters7e474022000-07-16 12:04:32 +00007260 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007261 i = 0; /* chars up to and including most recent \n or \r */
7262 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7263 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 for (p = self->str; p < e; p++)
7265 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 if (tabsize > 0) {
7267 incr = tabsize - (j % tabsize); /* cannot overflow */
7268 if (j > PY_SSIZE_T_MAX - incr)
7269 goto overflow1;
7270 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007271 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 if (j > PY_SSIZE_T_MAX - 1)
7275 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 j++;
7277 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 if (i > PY_SSIZE_T_MAX - j)
7279 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007281 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 }
7283 }
7284
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007285 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007287
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 /* Second pass: create output string and fill it */
7289 u = _PyUnicode_New(i + j);
7290 if (!u)
7291 return NULL;
7292
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007293 j = 0; /* same as in first pass */
7294 q = u->str; /* next output char */
7295 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
7297 for (p = self->str; p < e; p++)
7298 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 if (tabsize > 0) {
7300 i = tabsize - (j % tabsize);
7301 j += i;
7302 while (i--) {
7303 if (q >= qe)
7304 goto overflow2;
7305 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007308 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 else {
7310 if (q >= qe)
7311 goto overflow2;
7312 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007313 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 if (*p == '\n' || *p == '\r')
7315 j = 0;
7316 }
7317
7318 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007319
7320 overflow2:
7321 Py_DECREF(u);
7322 overflow1:
7323 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325}
7326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007327PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329\n\
7330Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007331such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332arguments start and end are interpreted as in slice notation.\n\
7333\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007334Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
7336static PyObject *
7337unicode_find(PyUnicodeObject *self, PyObject *args)
7338{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007340 Py_ssize_t start;
7341 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007342 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
Christian Heimes9cd17752007-11-18 19:35:23 +00007344 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347 result = stringlib_find_slice(
7348 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7349 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7350 start, end
7351 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
7353 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007354
Christian Heimes217cfd12007-12-02 14:31:20 +00007355 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356}
7357
7358static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360{
7361 if (index < 0 || index >= self->length) {
7362 PyErr_SetString(PyExc_IndexError, "string index out of range");
7363 return NULL;
7364 }
7365
7366 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7367}
7368
Guido van Rossumc2504932007-09-18 19:42:40 +00007369/* Believe it or not, this produces the same value for ASCII strings
7370 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007372unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373{
Guido van Rossumc2504932007-09-18 19:42:40 +00007374 Py_ssize_t len;
7375 Py_UNICODE *p;
7376 long x;
7377
7378 if (self->hash != -1)
7379 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007380 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007381 p = self->str;
7382 x = *p << 7;
7383 while (--len >= 0)
7384 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007385 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007386 if (x == -1)
7387 x = -2;
7388 self->hash = x;
7389 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
7397static PyObject *
7398unicode_index(PyUnicodeObject *self, PyObject *args)
7399{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007400 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007401 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007402 Py_ssize_t start;
7403 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
Christian Heimes9cd17752007-11-18 19:35:23 +00007405 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
Thomas Wouters477c8d52006-05-27 19:21:47 +00007408 result = stringlib_find_slice(
7409 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7410 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7411 start, end
7412 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 if (result < 0) {
7417 PyErr_SetString(PyExc_ValueError, "substring not found");
7418 return NULL;
7419 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007420
Christian Heimes217cfd12007-12-02 14:31:20 +00007421 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007427Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007431unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432{
7433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7434 register const Py_UNICODE *e;
7435 int cased;
7436
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 /* Shortcut for single character strings */
7438 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007442 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 e = p + PyUnicode_GET_SIZE(self);
7446 cased = 0;
7447 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7451 return PyBool_FromLong(0);
7452 else if (!cased && Py_UNICODE_ISLOWER(ch))
7453 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007455 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007461Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007465unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466{
7467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7468 register const Py_UNICODE *e;
7469 int cased;
7470
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 /* Shortcut for single character strings */
7472 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007475 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007476 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 e = p + PyUnicode_GET_SIZE(self);
7480 cased = 0;
7481 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007483
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7485 return PyBool_FromLong(0);
7486 else if (!cased && Py_UNICODE_ISUPPER(ch))
7487 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007489 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007495Return True if S is a titlecased string and there is at least one\n\
7496character in S, i.e. upper- and titlecase characters may only\n\
7497follow uncased characters and lowercase characters only cased ones.\n\
7498Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499
7500static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007501unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502{
7503 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7504 register const Py_UNICODE *e;
7505 int cased, previous_is_cased;
7506
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 /* Shortcut for single character strings */
7508 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7510 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007512 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007513 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007515
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 e = p + PyUnicode_GET_SIZE(self);
7517 cased = 0;
7518 previous_is_cased = 0;
7519 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007521
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7523 if (previous_is_cased)
7524 return PyBool_FromLong(0);
7525 previous_is_cased = 1;
7526 cased = 1;
7527 }
7528 else if (Py_UNICODE_ISLOWER(ch)) {
7529 if (!previous_is_cased)
7530 return PyBool_FromLong(0);
7531 previous_is_cased = 1;
7532 cased = 1;
7533 }
7534 else
7535 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007537 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538}
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007543Return True if all characters in S are whitespace\n\
7544and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
7546static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007547unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
7549 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7550 register const Py_UNICODE *e;
7551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 /* Shortcut for single character strings */
7553 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 Py_UNICODE_ISSPACE(*p))
7555 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007557 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007558 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007560
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 e = p + PyUnicode_GET_SIZE(self);
7562 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 if (!Py_UNICODE_ISSPACE(*p))
7564 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007566 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567}
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007571\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007572Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007573and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007574
7575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007576unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007577{
7578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7579 register const Py_UNICODE *e;
7580
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007581 /* Shortcut for single character strings */
7582 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 Py_UNICODE_ISALPHA(*p))
7584 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007585
7586 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007587 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007589
7590 e = p + PyUnicode_GET_SIZE(self);
7591 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 if (!Py_UNICODE_ISALPHA(*p))
7593 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007594 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007595 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007596}
7597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007598PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007600\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007601Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007603
7604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007605unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007606{
7607 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7608 register const Py_UNICODE *e;
7609
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007610 /* Shortcut for single character strings */
7611 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 Py_UNICODE_ISALNUM(*p))
7613 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007614
7615 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007616 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007618
7619 e = p + PyUnicode_GET_SIZE(self);
7620 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 if (!Py_UNICODE_ISALNUM(*p))
7622 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007623 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007624 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007625}
7626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007627PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007630Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007631False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
7633static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007634unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635{
7636 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7637 register const Py_UNICODE *e;
7638
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 /* Shortcut for single character strings */
7640 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 Py_UNICODE_ISDECIMAL(*p))
7642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007644 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007645 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007647
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 e = p + PyUnicode_GET_SIZE(self);
7649 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 if (!Py_UNICODE_ISDECIMAL(*p))
7651 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007653 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654}
7655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007656PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007659Return True if all characters in S are digits\n\
7660and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
7662static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007663unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664{
7665 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7666 register const Py_UNICODE *e;
7667
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 /* Shortcut for single character strings */
7669 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 Py_UNICODE_ISDIGIT(*p))
7671 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007673 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007674 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007676
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 e = p + PyUnicode_GET_SIZE(self);
7678 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 if (!Py_UNICODE_ISDIGIT(*p))
7680 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007682 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007688Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007692unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
7694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7695 register const Py_UNICODE *e;
7696
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 /* Shortcut for single character strings */
7698 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 Py_UNICODE_ISNUMERIC(*p))
7700 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007702 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007703 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007705
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 e = p + PyUnicode_GET_SIZE(self);
7707 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 if (!Py_UNICODE_ISNUMERIC(*p))
7709 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007711 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Martin v. Löwis47383402007-08-15 07:32:56 +00007714int
7715PyUnicode_IsIdentifier(PyObject *self)
7716{
7717 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7718 register const Py_UNICODE *e;
7719
7720 /* Special case for empty strings */
7721 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007723
7724 /* PEP 3131 says that the first character must be in
7725 XID_Start and subsequent characters in XID_Continue,
7726 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007728 letters, digits, underscore). However, given the current
7729 definition of XID_Start and XID_Continue, it is sufficient
7730 to check just for these, except that _ must be allowed
7731 as starting an identifier. */
7732 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7733 return 0;
7734
7735 e = p + PyUnicode_GET_SIZE(self);
7736 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 if (!_PyUnicode_IsXidContinue(*p))
7738 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007739 }
7740 return 1;
7741}
7742
7743PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007745\n\
7746Return True if S is a valid identifier according\n\
7747to the language definition.");
7748
7749static PyObject*
7750unicode_isidentifier(PyObject *self)
7751{
7752 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7753}
7754
Georg Brandl559e5d72008-06-11 18:37:52 +00007755PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007757\n\
7758Return True if all characters in S are considered\n\
7759printable in repr() or S is empty, False otherwise.");
7760
7761static PyObject*
7762unicode_isprintable(PyObject *self)
7763{
7764 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7765 register const Py_UNICODE *e;
7766
7767 /* Shortcut for single character strings */
7768 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7769 Py_RETURN_TRUE;
7770 }
7771
7772 e = p + PyUnicode_GET_SIZE(self);
7773 for (; p < e; p++) {
7774 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7775 Py_RETURN_FALSE;
7776 }
7777 }
7778 Py_RETURN_TRUE;
7779}
7780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007782 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783\n\
7784Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007785iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
7787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007788unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007790 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791}
7792
Martin v. Löwis18e16552006-02-15 17:27:45 +00007793static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794unicode_length(PyUnicodeObject *self)
7795{
7796 return self->length;
7797}
7798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007802Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007803done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject *
7806unicode_ljust(PyUnicodeObject *self, PyObject *args)
7807{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007808 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007809 Py_UNICODE fillchar = ' ';
7810
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007811 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 return NULL;
7813
Tim Peters7a29bd52001-09-12 03:03:31 +00007814 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 Py_INCREF(self);
7816 return (PyObject*) self;
7817 }
7818
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007819 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820}
7821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
7827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007828unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 return fixup(self, fixlower);
7831}
7832
/* Selectors for the strip family; the values double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Skip the three-character "|O:" prefix so that only the method name
   remains. */
#define STRIPNAME(i) (stripformat[i]+3)
7841
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007842/* externally visible for str.strip(unicode) */
7843PyObject *
7844_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7845{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007846 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7847 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7848 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7849 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7850 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007853
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 i = 0;
7855 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7857 i++;
7858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007860
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 j = len;
7862 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 do {
7864 j--;
7865 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7866 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007868
Benjamin Peterson14339b62009-01-31 16:36:08 +00007869 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 Py_INCREF(self);
7871 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 }
7873 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007875}
7876
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877
7878static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007879do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7882 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007883
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 i = 0;
7885 if (striptype != RIGHTSTRIP) {
7886 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7887 i++;
7888 }
7889 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007890
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 j = len;
7892 if (striptype != LEFTSTRIP) {
7893 do {
7894 j--;
7895 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7896 j++;
7897 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007898
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7900 Py_INCREF(self);
7901 return (PyObject*)self;
7902 }
7903 else
7904 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905}
7906
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007907
7908static PyObject *
7909do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007911 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007912
Benjamin Peterson14339b62009-01-31 16:36:08 +00007913 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7914 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007915
Benjamin Peterson14339b62009-01-31 16:36:08 +00007916 if (sep != NULL && sep != Py_None) {
7917 if (PyUnicode_Check(sep))
7918 return _PyUnicode_XStrip(self, striptype, sep);
7919 else {
7920 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 "%s arg must be None or str",
7922 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007923 return NULL;
7924 }
7925 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007926
Benjamin Peterson14339b62009-01-31 16:36:08 +00007927 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007928}
7929
7930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007931PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007933\n\
7934Return a copy of the string S with leading and trailing\n\
7935whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007936If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007937
7938static PyObject *
7939unicode_strip(PyUnicodeObject *self, PyObject *args)
7940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007941 if (PyTuple_GET_SIZE(args) == 0)
7942 return do_strip(self, BOTHSTRIP); /* Common case */
7943 else
7944 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007945}
7946
7947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007948PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007950\n\
7951Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007952If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007953
7954static PyObject *
7955unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 if (PyTuple_GET_SIZE(args) == 0)
7958 return do_strip(self, LEFTSTRIP); /* Common case */
7959 else
7960 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007961}
7962
7963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007964PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007966\n\
7967Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007968If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007969
7970static PyObject *
7971unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7972{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973 if (PyTuple_GET_SIZE(args) == 0)
7974 return do_strip(self, RIGHTSTRIP); /* Common case */
7975 else
7976 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007977}
7978
7979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
7983 PyUnicodeObject *u;
7984 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007986 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987
Georg Brandl222de0f2009-04-12 12:01:50 +00007988 if (len < 1) {
7989 Py_INCREF(unicode_empty);
7990 return (PyObject *)unicode_empty;
7991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992
Tim Peters7a29bd52001-09-12 03:03:31 +00007993 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 /* no repeat, return original string */
7995 Py_INCREF(str);
7996 return (PyObject*) str;
7997 }
Tim Peters8f422462000-09-09 06:13:41 +00007998
7999 /* ensure # of chars needed doesn't overflow int and # of bytes
8000 * needed doesn't overflow size_t
8001 */
8002 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008003 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008004 PyErr_SetString(PyExc_OverflowError,
8005 "repeated string is too long");
8006 return NULL;
8007 }
8008 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8009 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8010 PyErr_SetString(PyExc_OverflowError,
8011 "repeated string is too long");
8012 return NULL;
8013 }
8014 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 if (!u)
8016 return NULL;
8017
8018 p = u->str;
8019
Georg Brandl222de0f2009-04-12 12:01:50 +00008020 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008021 Py_UNICODE_FILL(p, str->str[0], len);
8022 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008023 Py_ssize_t done = str->length; /* number of characters copied this far */
8024 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008026 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008027 Py_UNICODE_COPY(p+done, p, n);
8028 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
8031
8032 return (PyObject*) u;
8033}
8034
8035PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 PyObject *subobj,
8037 PyObject *replobj,
8038 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
8040 PyObject *self;
8041 PyObject *str1;
8042 PyObject *str2;
8043 PyObject *result;
8044
8045 self = PyUnicode_FromObject(obj);
8046 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 str1 = PyUnicode_FromObject(subobj);
8049 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 Py_DECREF(self);
8051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
8053 str2 = PyUnicode_FromObject(replobj);
8054 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 Py_DECREF(self);
8056 Py_DECREF(str1);
8057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 }
Tim Petersced69f82003-09-16 20:30:58 +00008059 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 (PyUnicodeObject *)str1,
8061 (PyUnicodeObject *)str2,
8062 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 Py_DECREF(self);
8064 Py_DECREF(str1);
8065 Py_DECREF(str2);
8066 return result;
8067}
8068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008069PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071\n\
8072Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008073old replaced by new. If the optional argument count is\n\
8074given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076static PyObject*
8077unicode_replace(PyUnicodeObject *self, PyObject *args)
8078{
8079 PyUnicodeObject *str1;
8080 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 PyObject *result;
8083
Martin v. Löwis18e16552006-02-15 17:27:45 +00008084 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 return NULL;
8086 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8087 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008090 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 Py_DECREF(str1);
8092 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094
8095 result = replace(self, str1, str2, maxcount);
8096
8097 Py_DECREF(str1);
8098 Py_DECREF(str2);
8099 return result;
8100}
8101
8102static
8103PyObject *unicode_repr(PyObject *unicode)
8104{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008105 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008106 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008107 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8108 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8109
8110 /* XXX(nnorwitz): rather than over-allocating, it would be
8111 better to choose a different scheme. Perhaps scan the
8112 first N-chars of the string and allocate based on that size.
8113 */
8114 /* Initial allocation is based on the longest-possible unichr
8115 escape.
8116
8117 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8118 unichr, so in this case it's the longest unichr escape. In
8119 narrow (UTF-16) builds this is five chars per source unichr
8120 since there are two unichrs in the surrogate pair, so in narrow
8121 (UTF-16) builds it's not the longest unichr escape.
8122
8123 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8124 so in the narrow (UTF-16) build case it's the longest unichr
8125 escape.
8126 */
8127
Walter Dörwald1ab83302007-05-18 17:15:44 +00008128 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008130#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008132#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008134#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008136 if (repr == NULL)
8137 return NULL;
8138
Walter Dörwald1ab83302007-05-18 17:15:44 +00008139 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008140
8141 /* Add quote */
8142 *p++ = (findchar(s, size, '\'') &&
8143 !findchar(s, size, '"')) ? '"' : '\'';
8144 while (size-- > 0) {
8145 Py_UNICODE ch = *s++;
8146
8147 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008148 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008149 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008150 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008151 continue;
8152 }
8153
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008155 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008156 *p++ = '\\';
8157 *p++ = 't';
8158 }
8159 else if (ch == '\n') {
8160 *p++ = '\\';
8161 *p++ = 'n';
8162 }
8163 else if (ch == '\r') {
8164 *p++ = '\\';
8165 *p++ = 'r';
8166 }
8167
8168 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008169 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008170 *p++ = '\\';
8171 *p++ = 'x';
8172 *p++ = hexdigits[(ch >> 4) & 0x000F];
8173 *p++ = hexdigits[ch & 0x000F];
8174 }
8175
Georg Brandl559e5d72008-06-11 18:37:52 +00008176 /* Copy ASCII characters as-is */
8177 else if (ch < 0x7F) {
8178 *p++ = ch;
8179 }
8180
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008182 else {
8183 Py_UCS4 ucs = ch;
8184
8185#ifndef Py_UNICODE_WIDE
8186 Py_UNICODE ch2 = 0;
8187 /* Get code point from surrogate pair */
8188 if (size > 0) {
8189 ch2 = *s;
8190 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008195 size--;
8196 }
8197 }
8198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008200 (categories Z* and C* except ASCII space)
8201 */
8202 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8203 /* Map 8-bit characters to '\xhh' */
8204 if (ucs <= 0xff) {
8205 *p++ = '\\';
8206 *p++ = 'x';
8207 *p++ = hexdigits[(ch >> 4) & 0x000F];
8208 *p++ = hexdigits[ch & 0x000F];
8209 }
8210 /* Map 21-bit characters to '\U00xxxxxx' */
8211 else if (ucs >= 0x10000) {
8212 *p++ = '\\';
8213 *p++ = 'U';
8214 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8215 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8216 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8217 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8218 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8219 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8220 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8221 *p++ = hexdigits[ucs & 0x0000000F];
8222 }
8223 /* Map 16-bit characters to '\uxxxx' */
8224 else {
8225 *p++ = '\\';
8226 *p++ = 'u';
8227 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8228 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8229 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8230 *p++ = hexdigits[ucs & 0x000F];
8231 }
8232 }
8233 /* Copy characters as-is */
8234 else {
8235 *p++ = ch;
8236#ifndef Py_UNICODE_WIDE
8237 if (ucs >= 0x10000)
8238 *p++ = ch2;
8239#endif
8240 }
8241 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008242 }
8243 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008244 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008245
8246 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008247 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249}
8250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008251PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253\n\
8254Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008255such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256arguments start and end are interpreted as in slice notation.\n\
8257\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008258Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259
8260static PyObject *
8261unicode_rfind(PyUnicodeObject *self, PyObject *args)
8262{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008263 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008264 Py_ssize_t start;
8265 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
Christian Heimes9cd17752007-11-18 19:35:23 +00008268 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270
Thomas Wouters477c8d52006-05-27 19:21:47 +00008271 result = stringlib_rfind_slice(
8272 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8273 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8274 start, end
8275 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
8277 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008278
Christian Heimes217cfd12007-12-02 14:31:20 +00008279 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280}
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008285Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
8287static PyObject *
8288unicode_rindex(PyUnicodeObject *self, PyObject *args)
8289{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008290 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008291 Py_ssize_t start;
8292 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008293 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
Christian Heimes9cd17752007-11-18 19:35:23 +00008295 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Thomas Wouters477c8d52006-05-27 19:21:47 +00008298 result = stringlib_rfind_slice(
8299 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8300 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8301 start, end
8302 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303
8304 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008305
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 if (result < 0) {
8307 PyErr_SetString(PyExc_ValueError, "substring not found");
8308 return NULL;
8309 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311}
8312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008313PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008316Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008317done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318
8319static PyObject *
8320unicode_rjust(PyUnicodeObject *self, PyObject *args)
8321{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008322 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008323 Py_UNICODE fillchar = ' ';
8324
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008325 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 return NULL;
8327
Tim Peters7a29bd52001-09-12 03:03:31 +00008328 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 Py_INCREF(self);
8330 return (PyObject*) self;
8331 }
8332
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008333 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334}
8335
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 PyObject *sep,
8338 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339{
8340 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008341
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 s = PyUnicode_FromObject(s);
8343 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008344 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 if (sep != NULL) {
8346 sep = PyUnicode_FromObject(sep);
8347 if (sep == NULL) {
8348 Py_DECREF(s);
8349 return NULL;
8350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
8352
8353 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8354
8355 Py_DECREF(s);
8356 Py_XDECREF(sep);
8357 return result;
8358}
8359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008360PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362\n\
8363Return a list of the words in S, using sep as the\n\
8364delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008365splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008366whitespace string is a separator and empty strings are\n\
8367removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368
8369static PyObject*
8370unicode_split(PyUnicodeObject *self, PyObject *args)
8371{
8372 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374
Martin v. Löwis18e16552006-02-15 17:27:45 +00008375 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 return NULL;
8377
8378 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384}
8385
Thomas Wouters477c8d52006-05-27 19:21:47 +00008386PyObject *
8387PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8388{
8389 PyObject* str_obj;
8390 PyObject* sep_obj;
8391 PyObject* out;
8392
8393 str_obj = PyUnicode_FromObject(str_in);
8394 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396 sep_obj = PyUnicode_FromObject(sep_in);
8397 if (!sep_obj) {
8398 Py_DECREF(str_obj);
8399 return NULL;
8400 }
8401
8402 out = stringlib_partition(
8403 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8404 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8405 );
8406
8407 Py_DECREF(sep_obj);
8408 Py_DECREF(str_obj);
8409
8410 return out;
8411}
8412
8413
8414PyObject *
8415PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8416{
8417 PyObject* str_obj;
8418 PyObject* sep_obj;
8419 PyObject* out;
8420
8421 str_obj = PyUnicode_FromObject(str_in);
8422 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424 sep_obj = PyUnicode_FromObject(sep_in);
8425 if (!sep_obj) {
8426 Py_DECREF(str_obj);
8427 return NULL;
8428 }
8429
8430 out = stringlib_rpartition(
8431 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8432 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8433 );
8434
8435 Py_DECREF(sep_obj);
8436 Py_DECREF(str_obj);
8437
8438 return out;
8439}
8440
8441PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008443\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008444Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008445the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008446found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008447
8448static PyObject*
8449unicode_partition(PyUnicodeObject *self, PyObject *separator)
8450{
8451 return PyUnicode_Partition((PyObject *)self, separator);
8452}
8453
8454PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008456\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008457Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008458the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008459separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008460
8461static PyObject*
8462unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8463{
8464 return PyUnicode_RPartition((PyObject *)self, separator);
8465}
8466
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008467PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 PyObject *sep,
8469 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008470{
8471 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008473 s = PyUnicode_FromObject(s);
8474 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 if (sep != NULL) {
8477 sep = PyUnicode_FromObject(sep);
8478 if (sep == NULL) {
8479 Py_DECREF(s);
8480 return NULL;
8481 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008482 }
8483
8484 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8485
8486 Py_DECREF(s);
8487 Py_XDECREF(sep);
8488 return result;
8489}
8490
8491PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008493\n\
8494Return a list of the words in S, using sep as the\n\
8495delimiter string, starting at the end of the string and\n\
8496working to the front. If maxsplit is given, at most maxsplit\n\
8497splits are done. If sep is not specified, any whitespace string\n\
8498is a separator.");
8499
8500static PyObject*
8501unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8502{
8503 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008504 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008505
Martin v. Löwis18e16552006-02-15 17:27:45 +00008506 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008507 return NULL;
8508
8509 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008511 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008513 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008515}
8516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008517PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519\n\
8520Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008521Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008522is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
8524static PyObject*
8525unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8526{
Guido van Rossum86662912000-04-11 15:38:46 +00008527 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528
Guido van Rossum86662912000-04-11 15:38:46 +00008529 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 return NULL;
8531
Guido van Rossum86662912000-04-11 15:38:46 +00008532 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533}
8534
8535static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008536PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Walter Dörwald346737f2007-05-31 10:44:43 +00008538 if (PyUnicode_CheckExact(self)) {
8539 Py_INCREF(self);
8540 return self;
8541 } else
8542 /* Subtype -- return genuine unicode string with the same value. */
8543 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8544 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545}
8546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008547PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549\n\
8550Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008551and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
8553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008554unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 return fixup(self, fixswapcase);
8557}
8558
Georg Brandlceee0772007-11-27 23:48:05 +00008559PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008561\n\
8562Return a translation table usable for str.translate().\n\
8563If there is only one argument, it must be a dictionary mapping Unicode\n\
8564ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008565Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008566If there are two arguments, they must be strings of equal length, and\n\
8567in the resulting dictionary, each character in x will be mapped to the\n\
8568character at the same position in y. If there is a third argument, it\n\
8569must be a string, whose characters will be mapped to None in the result.");
8570
8571static PyObject*
8572unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8573{
8574 PyObject *x, *y = NULL, *z = NULL;
8575 PyObject *new = NULL, *key, *value;
8576 Py_ssize_t i = 0;
8577 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578
Georg Brandlceee0772007-11-27 23:48:05 +00008579 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8580 return NULL;
8581 new = PyDict_New();
8582 if (!new)
8583 return NULL;
8584 if (y != NULL) {
8585 /* x must be a string too, of equal length */
8586 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8587 if (!PyUnicode_Check(x)) {
8588 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8589 "be a string if there is a second argument");
8590 goto err;
8591 }
8592 if (PyUnicode_GET_SIZE(x) != ylen) {
8593 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8594 "arguments must have equal length");
8595 goto err;
8596 }
8597 /* create entries for translating chars in x to those in y */
8598 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008599 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8600 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008601 if (!key || !value)
8602 goto err;
8603 res = PyDict_SetItem(new, key, value);
8604 Py_DECREF(key);
8605 Py_DECREF(value);
8606 if (res < 0)
8607 goto err;
8608 }
8609 /* create entries for deleting chars in z */
8610 if (z != NULL) {
8611 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008612 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008613 if (!key)
8614 goto err;
8615 res = PyDict_SetItem(new, key, Py_None);
8616 Py_DECREF(key);
8617 if (res < 0)
8618 goto err;
8619 }
8620 }
8621 } else {
8622 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008623 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008624 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8625 "to maketrans it must be a dict");
8626 goto err;
8627 }
8628 /* copy entries into the new dict, converting string keys to int keys */
8629 while (PyDict_Next(x, &i, &key, &value)) {
8630 if (PyUnicode_Check(key)) {
8631 /* convert string keys to integer keys */
8632 PyObject *newkey;
8633 if (PyUnicode_GET_SIZE(key) != 1) {
8634 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8635 "table must be of length 1");
8636 goto err;
8637 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008638 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008639 if (!newkey)
8640 goto err;
8641 res = PyDict_SetItem(new, newkey, value);
8642 Py_DECREF(newkey);
8643 if (res < 0)
8644 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008645 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008646 /* just keep integer keys */
8647 if (PyDict_SetItem(new, key, value) < 0)
8648 goto err;
8649 } else {
8650 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8651 "be strings or integers");
8652 goto err;
8653 }
8654 }
8655 }
8656 return new;
8657 err:
8658 Py_DECREF(new);
8659 return NULL;
8660}
8661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008662PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664\n\
8665Return a copy of the string S, where all characters have been mapped\n\
8666through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008667Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008668Unmapped characters are left untouched. Characters mapped to None\n\
8669are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
8671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008672unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673{
Georg Brandlceee0772007-11-27 23:48:05 +00008674 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675}
8676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008677PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008680Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
8682static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008683unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 return fixup(self, fixupper);
8686}
8687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008688PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008691Pad a numeric string S with zeros on the left, to fill a field\n\
8692of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
8694static PyObject *
8695unicode_zfill(PyUnicodeObject *self, PyObject *args)
8696{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 PyUnicodeObject *u;
8699
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 Py_ssize_t width;
8701 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 return NULL;
8703
8704 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008705 if (PyUnicode_CheckExact(self)) {
8706 Py_INCREF(self);
8707 return (PyObject*) self;
8708 }
8709 else
8710 return PyUnicode_FromUnicode(
8711 PyUnicode_AS_UNICODE(self),
8712 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 }
8715
8716 fill = width - self->length;
8717
8718 u = pad(self, fill, 0, '0');
8719
Walter Dörwald068325e2002-04-15 13:36:47 +00008720 if (u == NULL)
8721 return NULL;
8722
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 if (u->str[fill] == '+' || u->str[fill] == '-') {
8724 /* move sign to beginning of string */
8725 u->str[0] = u->str[fill];
8726 u->str[fill] = '0';
8727 }
8728
8729 return (PyObject*) u;
8730}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731
#if 0
/* Compiled out.  Returns the current value of numfree as a Python int
   (presumably the number of objects on the unicode free list -- confirm
   against the definition of numfree). */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long cached = numfree;

    return PyLong_FromLong(cached);
}
#endif
8739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008740PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008743Return True if S starts with the specified prefix, False otherwise.\n\
8744With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008745With optional end, stop comparing S at that position.\n\
8746prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
8748static PyObject *
8749unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008752 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008754 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008755 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008756 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008758 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8760 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008761 if (PyTuple_Check(subobj)) {
8762 Py_ssize_t i;
8763 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8764 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008766 if (substring == NULL)
8767 return NULL;
8768 result = tailmatch(self, substring, start, end, -1);
8769 Py_DECREF(substring);
8770 if (result) {
8771 Py_RETURN_TRUE;
8772 }
8773 }
8774 /* nothing matched */
8775 Py_RETURN_FALSE;
8776 }
8777 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008780 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008782 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783}
8784
8785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008786PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008789Return True if S ends with the specified suffix, False otherwise.\n\
8790With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008791With optional end, stop comparing S at that position.\n\
8792suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793
8794static PyObject *
8795unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008798 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008800 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008801 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008802 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008804 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8806 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008807 if (PyTuple_Check(subobj)) {
8808 Py_ssize_t i;
8809 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8810 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008812 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008814 result = tailmatch(self, substring, start, end, +1);
8815 Py_DECREF(substring);
8816 if (result) {
8817 Py_RETURN_TRUE;
8818 }
8819 }
8820 Py_RETURN_FALSE;
8821 }
8822 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008826 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008828 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829}
8830
Eric Smith8c663262007-08-25 02:26:07 +00008831#include "stringlib/string_format.h"
8832
8833PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008835\n\
8836");
8837
Eric Smith4a7d76d2008-05-30 18:10:19 +00008838static PyObject *
8839unicode__format__(PyObject* self, PyObject* args)
8840{
8841 PyObject *format_spec;
8842
8843 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8844 return NULL;
8845
8846 return _PyUnicode_FormatAdvanced(self,
8847 PyUnicode_AS_UNICODE(format_spec),
8848 PyUnicode_GET_SIZE(format_spec));
8849}
8850
Eric Smith8c663262007-08-25 02:26:07 +00008851PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008853\n\
8854");
8855
8856static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008857unicode__sizeof__(PyUnicodeObject *v)
8858{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008859 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8860 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008861}
8862
8863PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008865
8866static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008867unicode_getnewargs(PyUnicodeObject *v)
8868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008869 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008870}
8871
8872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873static PyMethodDef unicode_methods[] = {
8874
8875 /* Order is according to common usage: often used methods should
8876 appear first, since lookup is done sequentially. */
8877
Benjamin Peterson308d6372009-09-18 21:42:35 +00008878 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008879 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8880 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008881 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008882 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8883 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8884 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8885 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8886 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8887 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8888 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008889 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008890 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8891 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8892 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008893 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008894 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8895 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8896 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008897 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008898 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008899 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008900 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008901 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8902 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8903 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8904 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8905 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8906 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8907 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8908 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8909 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8910 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8911 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8912 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8913 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8914 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008915 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008916 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008917 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008918 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008919 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008920 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8921 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008922 {"maketrans", (PyCFunction) unicode_maketrans,
8923 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008924 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008925#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008926 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927#endif
8928
8929#if 0
8930 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008931 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932#endif
8933
Benjamin Peterson14339b62009-01-31 16:36:08 +00008934 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 {NULL, NULL}
8936};
8937
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008938static PyObject *
8939unicode_mod(PyObject *v, PyObject *w)
8940{
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 if (!PyUnicode_Check(v)) {
8942 Py_INCREF(Py_NotImplemented);
8943 return Py_NotImplemented;
8944 }
8945 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008946}
8947
8948static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008949 0, /*nb_add*/
8950 0, /*nb_subtract*/
8951 0, /*nb_multiply*/
8952 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008953};
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008956 (lenfunc) unicode_length, /* sq_length */
8957 PyUnicode_Concat, /* sq_concat */
8958 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8959 (ssizeargfunc) unicode_getitem, /* sq_item */
8960 0, /* sq_slice */
8961 0, /* sq_ass_item */
8962 0, /* sq_ass_slice */
8963 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964};
8965
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008966static PyObject*
8967unicode_subscript(PyUnicodeObject* self, PyObject* item)
8968{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008969 if (PyIndex_Check(item)) {
8970 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008971 if (i == -1 && PyErr_Occurred())
8972 return NULL;
8973 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008974 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008975 return unicode_getitem(self, i);
8976 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008977 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008978 Py_UNICODE* source_buf;
8979 Py_UNICODE* result_buf;
8980 PyObject* result;
8981
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008982 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008984 return NULL;
8985 }
8986
8987 if (slicelength <= 0) {
8988 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008989 } else if (start == 0 && step == 1 && slicelength == self->length &&
8990 PyUnicode_CheckExact(self)) {
8991 Py_INCREF(self);
8992 return (PyObject *)self;
8993 } else if (step == 1) {
8994 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008995 } else {
8996 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008997 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8998 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 if (result_buf == NULL)
9001 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009002
9003 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9004 result_buf[i] = source_buf[cur];
9005 }
Tim Petersced69f82003-09-16 20:30:58 +00009006
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009007 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009008 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009009 return result;
9010 }
9011 } else {
9012 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9013 return NULL;
9014 }
9015}
9016
9017static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009018 (lenfunc)unicode_length, /* mp_length */
9019 (binaryfunc)unicode_subscript, /* mp_subscript */
9020 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009021};
9022
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024/* Helpers for PyUnicode_Format() */
9025
9026static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009029 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 (*p_argidx)++;
9032 if (arglen < 0)
9033 return args;
9034 else
9035 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 }
9037 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 return NULL;
9040}
9041
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009042/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009044static PyObject *
9045formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009047 char *p;
9048 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 x = PyFloat_AsDouble(v);
9052 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009053 return NULL;
9054
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009057
Eric Smith0923d1d2009-04-16 20:16:10 +00009058 p = PyOS_double_to_string(x, type, prec,
9059 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009060 if (p == NULL)
9061 return NULL;
9062 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009063 PyMem_Free(p);
9064 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
9066
Tim Peters38fd5b62000-09-21 05:43:11 +00009067static PyObject*
9068formatlong(PyObject *val, int flags, int prec, int type)
9069{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009070 char *buf;
9071 int len;
9072 PyObject *str; /* temporary string object. */
9073 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009074
Benjamin Peterson14339b62009-01-31 16:36:08 +00009075 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9076 if (!str)
9077 return NULL;
9078 result = PyUnicode_FromStringAndSize(buf, len);
9079 Py_DECREF(str);
9080 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009081}
9082
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083static int
9084formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009085 size_t buflen,
9086 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009088 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009089 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 if (PyUnicode_GET_SIZE(v) == 1) {
9091 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9092 buf[1] = '\0';
9093 return 1;
9094 }
9095#ifndef Py_UNICODE_WIDE
9096 if (PyUnicode_GET_SIZE(v) == 2) {
9097 /* Decode a valid surrogate pair */
9098 int c0 = PyUnicode_AS_UNICODE(v)[0];
9099 int c1 = PyUnicode_AS_UNICODE(v)[1];
9100 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9101 0xDC00 <= c1 && c1 <= 0xDFFF) {
9102 buf[0] = c0;
9103 buf[1] = c1;
9104 buf[2] = '\0';
9105 return 2;
9106 }
9107 }
9108#endif
9109 goto onError;
9110 }
9111 else {
9112 /* Integer input truncated to a character */
9113 long x;
9114 x = PyLong_AsLong(v);
9115 if (x == -1 && PyErr_Occurred())
9116 goto onError;
9117
9118 if (x < 0 || x > 0x10ffff) {
9119 PyErr_SetString(PyExc_OverflowError,
9120 "%c arg not in range(0x110000)");
9121 return -1;
9122 }
9123
9124#ifndef Py_UNICODE_WIDE
9125 if (x > 0xffff) {
9126 x -= 0x10000;
9127 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9128 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9129 return 2;
9130 }
9131#endif
9132 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 buf[1] = '\0';
9134 return 1;
9135 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009136
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009138 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009140 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141}
9142
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so that it behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009147
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
9151 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009152 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 int args_owned = 0;
9154 PyUnicodeObject *result = NULL;
9155 PyObject *dict = NULL;
9156 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009157
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 PyErr_BadInternalCall();
9160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 }
9162 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009163 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165 fmt = PyUnicode_AS_UNICODE(uformat);
9166 fmtcnt = PyUnicode_GET_SIZE(uformat);
9167
9168 reslen = rescnt = fmtcnt + 100;
9169 result = _PyUnicode_New(reslen);
9170 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 res = PyUnicode_AS_UNICODE(result);
9173
9174 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 arglen = PyTuple_Size(args);
9176 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 }
9178 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 arglen = -1;
9180 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009182 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009183 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185
9186 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 if (*fmt != '%') {
9188 if (--rescnt < 0) {
9189 rescnt = fmtcnt + 100;
9190 reslen += rescnt;
9191 if (_PyUnicode_Resize(&result, reslen) < 0)
9192 goto onError;
9193 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9194 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009197 }
9198 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 /* Got a format specifier */
9200 int flags = 0;
9201 Py_ssize_t width = -1;
9202 int prec = -1;
9203 Py_UNICODE c = '\0';
9204 Py_UNICODE fill;
9205 int isnumok;
9206 PyObject *v = NULL;
9207 PyObject *temp = NULL;
9208 Py_UNICODE *pbuf;
9209 Py_UNICODE sign;
9210 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009211 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 fmt++;
9214 if (*fmt == '(') {
9215 Py_UNICODE *keystart;
9216 Py_ssize_t keylen;
9217 PyObject *key;
9218 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009219
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 if (dict == NULL) {
9221 PyErr_SetString(PyExc_TypeError,
9222 "format requires a mapping");
9223 goto onError;
9224 }
9225 ++fmt;
9226 --fmtcnt;
9227 keystart = fmt;
9228 /* Skip over balanced parentheses */
9229 while (pcount > 0 && --fmtcnt >= 0) {
9230 if (*fmt == ')')
9231 --pcount;
9232 else if (*fmt == '(')
9233 ++pcount;
9234 fmt++;
9235 }
9236 keylen = fmt - keystart - 1;
9237 if (fmtcnt < 0 || pcount > 0) {
9238 PyErr_SetString(PyExc_ValueError,
9239 "incomplete format key");
9240 goto onError;
9241 }
9242#if 0
9243 /* keys are converted to strings using UTF-8 and
9244 then looked up since Python uses strings to hold
9245 variables names etc. in its namespaces and we
9246 wouldn't want to break common idioms. */
9247 key = PyUnicode_EncodeUTF8(keystart,
9248 keylen,
9249 NULL);
9250#else
9251 key = PyUnicode_FromUnicode(keystart, keylen);
9252#endif
9253 if (key == NULL)
9254 goto onError;
9255 if (args_owned) {
9256 Py_DECREF(args);
9257 args_owned = 0;
9258 }
9259 args = PyObject_GetItem(dict, key);
9260 Py_DECREF(key);
9261 if (args == NULL) {
9262 goto onError;
9263 }
9264 args_owned = 1;
9265 arglen = -1;
9266 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009267 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009268 while (--fmtcnt >= 0) {
9269 switch (c = *fmt++) {
9270 case '-': flags |= F_LJUST; continue;
9271 case '+': flags |= F_SIGN; continue;
9272 case ' ': flags |= F_BLANK; continue;
9273 case '#': flags |= F_ALT; continue;
9274 case '0': flags |= F_ZERO; continue;
9275 }
9276 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009277 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009278 if (c == '*') {
9279 v = getnextarg(args, arglen, &argidx);
9280 if (v == NULL)
9281 goto onError;
9282 if (!PyLong_Check(v)) {
9283 PyErr_SetString(PyExc_TypeError,
9284 "* wants int");
9285 goto onError;
9286 }
9287 width = PyLong_AsLong(v);
9288 if (width == -1 && PyErr_Occurred())
9289 goto onError;
9290 if (width < 0) {
9291 flags |= F_LJUST;
9292 width = -width;
9293 }
9294 if (--fmtcnt >= 0)
9295 c = *fmt++;
9296 }
9297 else if (c >= '0' && c <= '9') {
9298 width = c - '0';
9299 while (--fmtcnt >= 0) {
9300 c = *fmt++;
9301 if (c < '0' || c > '9')
9302 break;
9303 if ((width*10) / 10 != width) {
9304 PyErr_SetString(PyExc_ValueError,
9305 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009306 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 }
9308 width = width*10 + (c - '0');
9309 }
9310 }
9311 if (c == '.') {
9312 prec = 0;
9313 if (--fmtcnt >= 0)
9314 c = *fmt++;
9315 if (c == '*') {
9316 v = getnextarg(args, arglen, &argidx);
9317 if (v == NULL)
9318 goto onError;
9319 if (!PyLong_Check(v)) {
9320 PyErr_SetString(PyExc_TypeError,
9321 "* wants int");
9322 goto onError;
9323 }
9324 prec = PyLong_AsLong(v);
9325 if (prec == -1 && PyErr_Occurred())
9326 goto onError;
9327 if (prec < 0)
9328 prec = 0;
9329 if (--fmtcnt >= 0)
9330 c = *fmt++;
9331 }
9332 else if (c >= '0' && c <= '9') {
9333 prec = c - '0';
9334 while (--fmtcnt >= 0) {
9335 c = Py_CHARMASK(*fmt++);
9336 if (c < '0' || c > '9')
9337 break;
9338 if ((prec*10) / 10 != prec) {
9339 PyErr_SetString(PyExc_ValueError,
9340 "prec too big");
9341 goto onError;
9342 }
9343 prec = prec*10 + (c - '0');
9344 }
9345 }
9346 } /* prec */
9347 if (fmtcnt >= 0) {
9348 if (c == 'h' || c == 'l' || c == 'L') {
9349 if (--fmtcnt >= 0)
9350 c = *fmt++;
9351 }
9352 }
9353 if (fmtcnt < 0) {
9354 PyErr_SetString(PyExc_ValueError,
9355 "incomplete format");
9356 goto onError;
9357 }
9358 if (c != '%') {
9359 v = getnextarg(args, arglen, &argidx);
9360 if (v == NULL)
9361 goto onError;
9362 }
9363 sign = 0;
9364 fill = ' ';
9365 switch (c) {
9366
9367 case '%':
9368 pbuf = formatbuf;
9369 /* presume that buffer length is at least 1 */
9370 pbuf[0] = '%';
9371 len = 1;
9372 break;
9373
9374 case 's':
9375 case 'r':
9376 case 'a':
9377 if (PyUnicode_Check(v) && c == 's') {
9378 temp = v;
9379 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380 }
9381 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 if (c == 's')
9383 temp = PyObject_Str(v);
9384 else if (c == 'r')
9385 temp = PyObject_Repr(v);
9386 else
9387 temp = PyObject_ASCII(v);
9388 if (temp == NULL)
9389 goto onError;
9390 if (PyUnicode_Check(temp))
9391 /* nothing to do */;
9392 else {
9393 Py_DECREF(temp);
9394 PyErr_SetString(PyExc_TypeError,
9395 "%s argument has non-string str()");
9396 goto onError;
9397 }
9398 }
9399 pbuf = PyUnicode_AS_UNICODE(temp);
9400 len = PyUnicode_GET_SIZE(temp);
9401 if (prec >= 0 && len > prec)
9402 len = prec;
9403 break;
9404
9405 case 'i':
9406 case 'd':
9407 case 'u':
9408 case 'o':
9409 case 'x':
9410 case 'X':
9411 if (c == 'i')
9412 c = 'd';
9413 isnumok = 0;
9414 if (PyNumber_Check(v)) {
9415 PyObject *iobj=NULL;
9416
9417 if (PyLong_Check(v)) {
9418 iobj = v;
9419 Py_INCREF(iobj);
9420 }
9421 else {
9422 iobj = PyNumber_Long(v);
9423 }
9424 if (iobj!=NULL) {
9425 if (PyLong_Check(iobj)) {
9426 isnumok = 1;
9427 temp = formatlong(iobj, flags, prec, c);
9428 Py_DECREF(iobj);
9429 if (!temp)
9430 goto onError;
9431 pbuf = PyUnicode_AS_UNICODE(temp);
9432 len = PyUnicode_GET_SIZE(temp);
9433 sign = 1;
9434 }
9435 else {
9436 Py_DECREF(iobj);
9437 }
9438 }
9439 }
9440 if (!isnumok) {
9441 PyErr_Format(PyExc_TypeError,
9442 "%%%c format: a number is required, "
9443 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9444 goto onError;
9445 }
9446 if (flags & F_ZERO)
9447 fill = '0';
9448 break;
9449
9450 case 'e':
9451 case 'E':
9452 case 'f':
9453 case 'F':
9454 case 'g':
9455 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009456 temp = formatfloat(v, flags, prec, c);
9457 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009459 pbuf = PyUnicode_AS_UNICODE(temp);
9460 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 sign = 1;
9462 if (flags & F_ZERO)
9463 fill = '0';
9464 break;
9465
9466 case 'c':
9467 pbuf = formatbuf;
9468 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9469 if (len < 0)
9470 goto onError;
9471 break;
9472
9473 default:
9474 PyErr_Format(PyExc_ValueError,
9475 "unsupported format character '%c' (0x%x) "
9476 "at index %zd",
9477 (31<=c && c<=126) ? (char)c : '?',
9478 (int)c,
9479 (Py_ssize_t)(fmt - 1 -
9480 PyUnicode_AS_UNICODE(uformat)));
9481 goto onError;
9482 }
9483 if (sign) {
9484 if (*pbuf == '-' || *pbuf == '+') {
9485 sign = *pbuf++;
9486 len--;
9487 }
9488 else if (flags & F_SIGN)
9489 sign = '+';
9490 else if (flags & F_BLANK)
9491 sign = ' ';
9492 else
9493 sign = 0;
9494 }
9495 if (width < len)
9496 width = len;
9497 if (rescnt - (sign != 0) < width) {
9498 reslen -= rescnt;
9499 rescnt = width + fmtcnt + 100;
9500 reslen += rescnt;
9501 if (reslen < 0) {
9502 Py_XDECREF(temp);
9503 PyErr_NoMemory();
9504 goto onError;
9505 }
9506 if (_PyUnicode_Resize(&result, reslen) < 0) {
9507 Py_XDECREF(temp);
9508 goto onError;
9509 }
9510 res = PyUnicode_AS_UNICODE(result)
9511 + reslen - rescnt;
9512 }
9513 if (sign) {
9514 if (fill != ' ')
9515 *res++ = sign;
9516 rescnt--;
9517 if (width > len)
9518 width--;
9519 }
9520 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9521 assert(pbuf[0] == '0');
9522 assert(pbuf[1] == c);
9523 if (fill != ' ') {
9524 *res++ = *pbuf++;
9525 *res++ = *pbuf++;
9526 }
9527 rescnt -= 2;
9528 width -= 2;
9529 if (width < 0)
9530 width = 0;
9531 len -= 2;
9532 }
9533 if (width > len && !(flags & F_LJUST)) {
9534 do {
9535 --rescnt;
9536 *res++ = fill;
9537 } while (--width > len);
9538 }
9539 if (fill == ' ') {
9540 if (sign)
9541 *res++ = sign;
9542 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9543 assert(pbuf[0] == '0');
9544 assert(pbuf[1] == c);
9545 *res++ = *pbuf++;
9546 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009547 }
9548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 Py_UNICODE_COPY(res, pbuf, len);
9550 res += len;
9551 rescnt -= len;
9552 while (--width >= len) {
9553 --rescnt;
9554 *res++ = ' ';
9555 }
9556 if (dict && (argidx < arglen) && c != '%') {
9557 PyErr_SetString(PyExc_TypeError,
9558 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009559 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 goto onError;
9561 }
9562 Py_XDECREF(temp);
9563 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 } /* until end */
9565 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 PyErr_SetString(PyExc_TypeError,
9567 "not all arguments converted during string formatting");
9568 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 }
9570
Thomas Woutersa96affe2006-03-12 00:29:36 +00009571 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575 }
9576 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 return (PyObject *)result;
9578
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580 Py_XDECREF(result);
9581 Py_DECREF(uformat);
9582 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 }
9585 return NULL;
9586}
9587
Jeremy Hylton938ace62002-07-17 16:30:39 +00009588static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009589unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9590
Tim Peters6d6c1a32001-08-02 04:15:00 +00009591static PyObject *
9592unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9593{
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009595 static char *kwlist[] = {"object", "encoding", "errors", 0};
9596 char *encoding = NULL;
9597 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009598
Benjamin Peterson14339b62009-01-31 16:36:08 +00009599 if (type != &PyUnicode_Type)
9600 return unicode_subtype_new(type, args, kwds);
9601 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 return NULL;
9604 if (x == NULL)
9605 return (PyObject *)_PyUnicode_New(0);
9606 if (encoding == NULL && errors == NULL)
9607 return PyObject_Str(x);
9608 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009610}
9611
Guido van Rossume023fe02001-08-30 03:12:59 +00009612static PyObject *
9613unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9614{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 PyUnicodeObject *tmp, *pnew;
9616 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009617
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9619 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9620 if (tmp == NULL)
9621 return NULL;
9622 assert(PyUnicode_Check(tmp));
9623 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9624 if (pnew == NULL) {
9625 Py_DECREF(tmp);
9626 return NULL;
9627 }
9628 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9629 if (pnew->str == NULL) {
9630 _Py_ForgetReference((PyObject *)pnew);
9631 PyObject_Del(pnew);
9632 Py_DECREF(tmp);
9633 return PyErr_NoMemory();
9634 }
9635 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9636 pnew->length = n;
9637 pnew->hash = tmp->hash;
9638 Py_DECREF(tmp);
9639 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009640}
9641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009642PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009644\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009645Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009646encoding defaults to the current default string encoding.\n\
9647errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009648
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009649static PyObject *unicode_iter(PyObject *seq);
9650
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009652 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009653 "str", /* tp_name */
9654 sizeof(PyUnicodeObject), /* tp_size */
9655 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009657 (destructor)unicode_dealloc, /* tp_dealloc */
9658 0, /* tp_print */
9659 0, /* tp_getattr */
9660 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009661 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009662 unicode_repr, /* tp_repr */
9663 &unicode_as_number, /* tp_as_number */
9664 &unicode_as_sequence, /* tp_as_sequence */
9665 &unicode_as_mapping, /* tp_as_mapping */
9666 (hashfunc) unicode_hash, /* tp_hash*/
9667 0, /* tp_call*/
9668 (reprfunc) unicode_str, /* tp_str */
9669 PyObject_GenericGetAttr, /* tp_getattro */
9670 0, /* tp_setattro */
9671 0, /* tp_as_buffer */
9672 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 unicode_doc, /* tp_doc */
9675 0, /* tp_traverse */
9676 0, /* tp_clear */
9677 PyUnicode_RichCompare, /* tp_richcompare */
9678 0, /* tp_weaklistoffset */
9679 unicode_iter, /* tp_iter */
9680 0, /* tp_iternext */
9681 unicode_methods, /* tp_methods */
9682 0, /* tp_members */
9683 0, /* tp_getset */
9684 &PyBaseObject_Type, /* tp_base */
9685 0, /* tp_dict */
9686 0, /* tp_descr_get */
9687 0, /* tp_descr_set */
9688 0, /* tp_dictoffset */
9689 0, /* tp_init */
9690 0, /* tp_alloc */
9691 unicode_new, /* tp_new */
9692 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693};
9694
9695/* Initialize the Unicode implementation */
9696
Thomas Wouters78890102000-07-22 19:25:51 +00009697void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009699 int i;
9700
Thomas Wouters477c8d52006-05-27 19:21:47 +00009701 /* XXX - move this array to unicodectype.c ? */
9702 Py_UNICODE linebreak[] = {
9703 0x000A, /* LINE FEED */
9704 0x000D, /* CARRIAGE RETURN */
9705 0x001C, /* FILE SEPARATOR */
9706 0x001D, /* GROUP SEPARATOR */
9707 0x001E, /* RECORD SEPARATOR */
9708 0x0085, /* NEXT LINE */
9709 0x2028, /* LINE SEPARATOR */
9710 0x2029, /* PARAGRAPH SEPARATOR */
9711 };
9712
Fred Drakee4315f52000-05-09 19:53:39 +00009713 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009714 free_list = NULL;
9715 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009717 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009719
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009720 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009721 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009722 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724
9725 /* initialize the linebreak bloom filter */
9726 bloom_linebreak = make_bloom_mask(
9727 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9728 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009729
9730 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731}
9732
9733/* Finalize the Unicode implementation */
9734
Christian Heimesa156e092008-02-16 07:38:31 +00009735int
9736PyUnicode_ClearFreeList(void)
9737{
9738 int freelist_size = numfree;
9739 PyUnicodeObject *u;
9740
9741 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009742 PyUnicodeObject *v = u;
9743 u = *(PyUnicodeObject **)u;
9744 if (v->str)
9745 PyObject_DEL(v->str);
9746 Py_XDECREF(v->defenc);
9747 PyObject_Del(v);
9748 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009749 }
9750 free_list = NULL;
9751 assert(numfree == 0);
9752 return freelist_size;
9753}
9754
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755void
Thomas Wouters78890102000-07-22 19:25:51 +00009756_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009758 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009760 Py_XDECREF(unicode_empty);
9761 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009762
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009763 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 if (unicode_latin1[i]) {
9765 Py_DECREF(unicode_latin1[i]);
9766 unicode_latin1[i] = NULL;
9767 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009768 }
Christian Heimesa156e092008-02-16 07:38:31 +00009769 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009771
Walter Dörwald16807132007-05-25 13:52:07 +00009772void
9773PyUnicode_InternInPlace(PyObject **p)
9774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009775 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9776 PyObject *t;
9777 if (s == NULL || !PyUnicode_Check(s))
9778 Py_FatalError(
9779 "PyUnicode_InternInPlace: unicode strings only please!");
9780 /* If it's a subclass, we don't really know what putting
9781 it in the interned dict might do. */
9782 if (!PyUnicode_CheckExact(s))
9783 return;
9784 if (PyUnicode_CHECK_INTERNED(s))
9785 return;
9786 if (interned == NULL) {
9787 interned = PyDict_New();
9788 if (interned == NULL) {
9789 PyErr_Clear(); /* Don't leave an exception */
9790 return;
9791 }
9792 }
9793 /* It might be that the GetItem call fails even
9794 though the key is present in the dictionary,
9795 namely when this happens during a stack overflow. */
9796 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009797 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009798 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009799
Benjamin Peterson29060642009-01-31 22:14:21 +00009800 if (t) {
9801 Py_INCREF(t);
9802 Py_DECREF(*p);
9803 *p = t;
9804 return;
9805 }
Walter Dörwald16807132007-05-25 13:52:07 +00009806
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807 PyThreadState_GET()->recursion_critical = 1;
9808 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9809 PyErr_Clear();
9810 PyThreadState_GET()->recursion_critical = 0;
9811 return;
9812 }
9813 PyThreadState_GET()->recursion_critical = 0;
9814 /* The two references in interned are not counted by refcnt.
9815 The deallocator will take care of this */
9816 Py_REFCNT(s) -= 2;
9817 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009818}
9819
9820void
9821PyUnicode_InternImmortal(PyObject **p)
9822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009823 PyUnicode_InternInPlace(p);
9824 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9825 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9826 Py_INCREF(*p);
9827 }
Walter Dörwald16807132007-05-25 13:52:07 +00009828}
9829
9830PyObject *
9831PyUnicode_InternFromString(const char *cp)
9832{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009833 PyObject *s = PyUnicode_FromString(cp);
9834 if (s == NULL)
9835 return NULL;
9836 PyUnicode_InternInPlace(&s);
9837 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009838}
9839
9840void _Py_ReleaseInternedUnicodeStrings(void)
9841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 PyObject *keys;
9843 PyUnicodeObject *s;
9844 Py_ssize_t i, n;
9845 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009846
Benjamin Peterson14339b62009-01-31 16:36:08 +00009847 if (interned == NULL || !PyDict_Check(interned))
9848 return;
9849 keys = PyDict_Keys(interned);
9850 if (keys == NULL || !PyList_Check(keys)) {
9851 PyErr_Clear();
9852 return;
9853 }
Walter Dörwald16807132007-05-25 13:52:07 +00009854
Benjamin Peterson14339b62009-01-31 16:36:08 +00009855 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9856 detector, interned unicode strings are not forcibly deallocated;
9857 rather, we give them their stolen references back, and then clear
9858 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009859
Benjamin Peterson14339b62009-01-31 16:36:08 +00009860 n = PyList_GET_SIZE(keys);
9861 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 for (i = 0; i < n; i++) {
9864 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9865 switch (s->state) {
9866 case SSTATE_NOT_INTERNED:
9867 /* XXX Shouldn't happen */
9868 break;
9869 case SSTATE_INTERNED_IMMORTAL:
9870 Py_REFCNT(s) += 1;
9871 immortal_size += s->length;
9872 break;
9873 case SSTATE_INTERNED_MORTAL:
9874 Py_REFCNT(s) += 2;
9875 mortal_size += s->length;
9876 break;
9877 default:
9878 Py_FatalError("Inconsistent interned string state.");
9879 }
9880 s->state = SSTATE_NOT_INTERNED;
9881 }
9882 fprintf(stderr, "total size of all interned strings: "
9883 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9884 "mortal/immortal\n", mortal_size, immortal_size);
9885 Py_DECREF(keys);
9886 PyDict_Clear(interned);
9887 Py_DECREF(interned);
9888 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009889}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009890
9891
9892/********************* Unicode Iterator **************************/
9893
9894typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 PyObject_HEAD
9896 Py_ssize_t it_index;
9897 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009898} unicodeiterobject;
9899
9900static void
9901unicodeiter_dealloc(unicodeiterobject *it)
9902{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 _PyObject_GC_UNTRACK(it);
9904 Py_XDECREF(it->it_seq);
9905 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009906}
9907
9908static int
9909unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009911 Py_VISIT(it->it_seq);
9912 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009913}
9914
9915static PyObject *
9916unicodeiter_next(unicodeiterobject *it)
9917{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 PyUnicodeObject *seq;
9919 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009920
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 assert(it != NULL);
9922 seq = it->it_seq;
9923 if (seq == NULL)
9924 return NULL;
9925 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009926
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9928 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 if (item != NULL)
9931 ++it->it_index;
9932 return item;
9933 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009934
Benjamin Peterson14339b62009-01-31 16:36:08 +00009935 Py_DECREF(seq);
9936 it->it_seq = NULL;
9937 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009938}
9939
9940static PyObject *
9941unicodeiter_len(unicodeiterobject *it)
9942{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009943 Py_ssize_t len = 0;
9944 if (it->it_seq)
9945 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9946 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009947}
9948
9949PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9950
9951static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009952 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009954 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009955};
9956
9957PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009958 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9959 "str_iterator", /* tp_name */
9960 sizeof(unicodeiterobject), /* tp_basicsize */
9961 0, /* tp_itemsize */
9962 /* methods */
9963 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9964 0, /* tp_print */
9965 0, /* tp_getattr */
9966 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009967 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009968 0, /* tp_repr */
9969 0, /* tp_as_number */
9970 0, /* tp_as_sequence */
9971 0, /* tp_as_mapping */
9972 0, /* tp_hash */
9973 0, /* tp_call */
9974 0, /* tp_str */
9975 PyObject_GenericGetAttr, /* tp_getattro */
9976 0, /* tp_setattro */
9977 0, /* tp_as_buffer */
9978 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9979 0, /* tp_doc */
9980 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9981 0, /* tp_clear */
9982 0, /* tp_richcompare */
9983 0, /* tp_weaklistoffset */
9984 PyObject_SelfIter, /* tp_iter */
9985 (iternextfunc)unicodeiter_next, /* tp_iternext */
9986 unicodeiter_methods, /* tp_methods */
9987 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009988};
9989
9990static PyObject *
9991unicode_iter(PyObject *seq)
9992{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009993 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009994
Benjamin Peterson14339b62009-01-31 16:36:08 +00009995 if (!PyUnicode_Check(seq)) {
9996 PyErr_BadInternalCall();
9997 return NULL;
9998 }
9999 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10000 if (it == NULL)
10001 return NULL;
10002 it->it_index = 0;
10003 Py_INCREF(seq);
10004 it->it_seq = (PyUnicodeObject *)seq;
10005 _PyObject_GC_TRACK(it);
10006 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010007}
10008
Martin v. Löwis5b222132007-06-10 09:51:05 +000010009size_t
10010Py_UNICODE_strlen(const Py_UNICODE *u)
10011{
10012 int res = 0;
10013 while(*u++)
10014 res++;
10015 return res;
10016}
10017
10018Py_UNICODE*
10019Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10020{
10021 Py_UNICODE *u = s1;
10022 while ((*u++ = *s2++));
10023 return s1;
10024}
10025
10026Py_UNICODE*
10027Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10028{
10029 Py_UNICODE *u = s1;
10030 while ((*u++ = *s2++))
10031 if (n-- == 0)
10032 break;
10033 return s1;
10034}
10035
10036int
10037Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10038{
10039 while (*s1 && *s2 && *s1 == *s2)
10040 s1++, s2++;
10041 if (*s1 && *s2)
10042 return (*s1 < *s2) ? -1 : +1;
10043 if (*s1)
10044 return 1;
10045 if (*s2)
10046 return -1;
10047 return 0;
10048}
10049
10050Py_UNICODE*
10051Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10052{
10053 const Py_UNICODE *p;
10054 for (p = s; *p; p++)
10055 if (*p == c)
10056 return (Py_UNICODE*)p;
10057 return NULL;
10058}
10059
10060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010061#ifdef __cplusplus
10062}
10063#endif
10064
10065
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010066/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 Local variables:
10068 c-basic-offset: 4
10069 indent-tabs-mode: nil
10070 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010071*/