/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list whose size is below this limit keep
   their allocated Unicode buffer intact, which reduces malloc()
   overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: a 128-entry
   lookup table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Fast detection of the most frequent line break characters: a 128-entry
   lookup table indexed by ASCII code, 1 for line breaks and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type used to hold a 32-bit bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shift is done on an unsigned long so that bit 31 can be tested
   without shifting into the sign bit of an int (undefined behaviour),
   and `mask` is parenthesized so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: ASCII characters are looked up in
   ascii_linebreak; everything else is pre-filtered through the bloom mask
   before the full Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218
219Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
220{
221 /* calculate simple bloom-style bitmask for a given unicode string */
222
223 long mask;
224 Py_ssize_t i;
225
226 mask = 0;
227 for (i = 0; i < len; i++)
228 mask |= (1 << (ptr[i] & 0x1F));
229
230 return mask;
231}
232
233Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
234{
235 Py_ssize_t i;
236
237 for (i = 0; i < setlen; i++)
238 if (set[i] == chr)
239 return 1;
240
241 return 0;
242}
243
/* Non-zero if `chr` is one of the `setlen` characters in `set`; the bloom
   `mask` is consulted first so that most non-members skip the linear
   search.  The expansion is fully parenthesized so the macro can be
   negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
246
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247/* --- Unicode Object ----------------------------------------------------- */
248
249static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000251 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252{
253 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000254
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259 /* Resizing shared object (unicode_empty or single character
260 objects) in-place is not allowed. Use PyUnicode_Resize()
261 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 (unicode->length == 1 &&
265 unicode->str[0] < 256U &&
266 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000268 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 return -1;
270 }
271
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272 /* We allocate one more byte to make sure the string is Ux0000 terminated.
273 The overallocation is also used by fastsearch, which assumes that it's
274 safe to look at str[length] (without making any assumptions about what
275 it contains). */
276
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000278 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000279 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000281 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 PyErr_NoMemory();
283 return -1;
284 }
285 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 if (unicode->defenc) {
291 Py_DECREF(unicode->defenc);
292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 }
294 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return 0;
297}
298
299/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000300 Ux0000 terminated; some code (e.g. new_identifier)
301 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302
303 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000304 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305
306*/
307
308static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000309PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 register PyUnicodeObject *unicode;
312
Thomas Wouters477c8d52006-05-27 19:21:47 +0000313 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 if (length == 0 && unicode_empty != NULL) {
315 Py_INCREF(unicode_empty);
316 return unicode_empty;
317 }
318
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000319 /* Ensure we won't overflow the size. */
320 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
321 return (PyUnicodeObject *)PyErr_NoMemory();
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000325 if (free_list) {
326 unicode = free_list;
327 free_list = *(PyUnicodeObject **)unicode;
328 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000329 if (unicode->str) {
330 /* Keep-Alive optimization: we only upsize the buffer,
331 never downsize it. */
332 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000333 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 PyObject_DEL(unicode->str);
335 unicode->str = NULL;
336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000337 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000341 }
342 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000346 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 if (unicode == NULL)
348 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000353 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 PyErr_NoMemory();
355 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000356 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000357 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000358 * the caller fails before initializing str -- unicode_resize()
359 * reads str[0], and the Keep-Alive optimization can keep memory
360 * allocated for str alive across a call to unicode_dealloc(unicode).
361 * We don't want unicode_resize to read uninitialized memory in
362 * that case.
363 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000364 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000368 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000369 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000373 /* XXX UNREF/NEWREF interface should be more symmetrical */
374 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000375 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000376 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378}
379
380static
Guido van Rossum9475a232001-10-05 20:51:39 +0000381void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
Walter Dörwald16807132007-05-25 13:52:07 +0000383 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 case SSTATE_NOT_INTERNED:
385 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000386
Benjamin Peterson29060642009-01-31 22:14:21 +0000387 case SSTATE_INTERNED_MORTAL:
388 /* revive dead object temporarily for DelItem */
389 Py_REFCNT(unicode) = 3;
390 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
391 Py_FatalError(
392 "deletion of interned string failed");
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_IMMORTAL:
396 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 default:
399 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000400 }
401
Guido van Rossum604ddf82001-12-06 20:03:56 +0000402 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000404 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
406 PyObject_DEL(unicode->str);
407 unicode->str = NULL;
408 unicode->length = 0;
409 }
410 if (unicode->defenc) {
411 Py_DECREF(unicode->defenc);
412 unicode->defenc = NULL;
413 }
414 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000415 *(PyUnicodeObject **)unicode = free_list;
416 free_list = unicode;
417 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyObject_DEL(unicode->str);
421 Py_XDECREF(unicode->defenc);
422 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424}
425
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000426static
427int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428{
429 register PyUnicodeObject *v;
430
431 /* Argument checks */
432 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_BadInternalCall();
434 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000436 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000437 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
441
442 /* Resizing unicode_empty and single character objects is not
443 possible since these are being shared. We simply return a fresh
444 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000445 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 (v == unicode_empty || v->length == 1)) {
447 PyUnicodeObject *w = _PyUnicode_New(length);
448 if (w == NULL)
449 return -1;
450 Py_UNICODE_COPY(w->str, v->str,
451 length < v->length ? length : v->length);
452 Py_DECREF(*unicode);
453 *unicode = w;
454 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000455 }
456
457 /* Note that we don't have to modify *unicode for unshared Unicode
458 objects, since we can modify them in-place. */
459 return unicode_resize(v, length);
460}
461
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000462int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
463{
464 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
465}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000468 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469{
470 PyUnicodeObject *unicode;
471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 /* If the Unicode data is known at construction time, we can apply
473 some optimizations which share commonly used objects. */
474 if (u != NULL) {
475
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000481
482 /* Single character Unicode objects in the Latin-1 range are
483 shared when using this constructor */
484 if (size == 1 && *u < 256) {
485 unicode = unicode_latin1[*u];
486 if (!unicode) {
487 unicode = _PyUnicode_New(1);
488 if (!unicode)
489 return NULL;
490 unicode->str[0] = *u;
491 unicode_latin1[*u] = unicode;
492 }
493 Py_INCREF(unicode);
494 return (PyObject *)unicode;
495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000496 }
Tim Petersced69f82003-09-16 20:30:58 +0000497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 /* Copy the Unicode data into the new object */
503 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000504 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 return (PyObject *)unicode;
507}
508
Walter Dörwaldd2034312007-05-18 16:29:38 +0000509PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000510{
511 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000512
Benjamin Peterson14339b62009-01-31 16:36:08 +0000513 if (size < 0) {
514 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return NULL;
517 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000519 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000520 some optimizations which share commonly used objects.
521 Also, this means the input must be UTF-8, so fall back to the
522 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (u != NULL) {
524
Benjamin Peterson29060642009-01-31 22:14:21 +0000525 /* Optimization for empty strings */
526 if (size == 0 && unicode_empty != NULL) {
527 Py_INCREF(unicode_empty);
528 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000530
531 /* Single characters are shared when using this constructor.
532 Restrict to ASCII, since the input must be UTF-8. */
533 if (size == 1 && Py_CHARMASK(*u) < 128) {
534 unicode = unicode_latin1[Py_CHARMASK(*u)];
535 if (!unicode) {
536 unicode = _PyUnicode_New(1);
537 if (!unicode)
538 return NULL;
539 unicode->str[0] = Py_CHARMASK(*u);
540 unicode_latin1[Py_CHARMASK(*u)] = unicode;
541 }
542 Py_INCREF(unicode);
543 return (PyObject *)unicode;
544 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000545
546 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000547 }
548
Walter Dörwald55507312007-05-18 13:12:10 +0000549 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000550 if (!unicode)
551 return NULL;
552
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 return (PyObject *)unicode;
554}
555
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556PyObject *PyUnicode_FromString(const char *u)
557{
558 size_t size = strlen(u);
559 if (size > PY_SSIZE_T_MAX) {
560 PyErr_SetString(PyExc_OverflowError, "input too long");
561 return NULL;
562 }
563
564 return PyUnicode_FromStringAndSize(u, size);
565}
566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567#ifdef HAVE_WCHAR_H
568
Mark Dickinson081dfee2009-03-18 14:47:41 +0000569#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
570# define CONVERT_WCHAR_TO_SURROGATES
571#endif
572
573#ifdef CONVERT_WCHAR_TO_SURROGATES
574
575/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
576 to convert from UTF32 to UTF16. */
577
578PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
579 Py_ssize_t size)
580{
581 PyUnicodeObject *unicode;
582 register Py_ssize_t i;
583 Py_ssize_t alloc;
584 const wchar_t *orig_w;
585
586 if (w == NULL) {
587 if (size == 0)
588 return PyUnicode_FromStringAndSize(NULL, 0);
589 PyErr_BadInternalCall();
590 return NULL;
591 }
592
593 if (size == -1) {
594 size = wcslen(w);
595 }
596
597 alloc = size;
598 orig_w = w;
599 for (i = size; i > 0; i--) {
600 if (*w > 0xFFFF)
601 alloc++;
602 w++;
603 }
604 w = orig_w;
605 unicode = _PyUnicode_New(alloc);
606 if (!unicode)
607 return NULL;
608
609 /* Copy the wchar_t data into the new object */
610 {
611 register Py_UNICODE *u;
612 u = PyUnicode_AS_UNICODE(unicode);
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF) {
615 wchar_t ordinal = *w++;
616 ordinal -= 0x10000;
617 *u++ = 0xD800 | (ordinal >> 10);
618 *u++ = 0xDC00 | (ordinal & 0x3FF);
619 }
620 else
621 *u++ = *w++;
622 }
623 }
624 return (PyObject *)unicode;
625}
626
627#else
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000630 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631{
632 PyUnicodeObject *unicode;
633
634 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000635 if (size == 0)
636 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 PyErr_BadInternalCall();
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == -1) {
642 size = wcslen(w);
643 }
644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 unicode = _PyUnicode_New(size);
646 if (!unicode)
647 return NULL;
648
649 /* Copy the wchar_t data into the new object */
650#ifdef HAVE_USABLE_WCHAR_T
651 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000652#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000654 register Py_UNICODE *u;
655 register Py_ssize_t i;
656 u = PyUnicode_AS_UNICODE(unicode);
657 for (i = size; i > 0; i--)
658 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 }
660#endif
661
662 return (PyObject *)unicode;
663}
664
Mark Dickinson081dfee2009-03-18 14:47:41 +0000665#endif /* CONVERT_WCHAR_TO_SURROGATES */
666
667#undef CONVERT_WCHAR_TO_SURROGATES
668
Walter Dörwald346737f2007-05-31 10:44:43 +0000669static void
670makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
671{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000672 *fmt++ = '%';
673 if (width) {
674 if (zeropad)
675 *fmt++ = '0';
676 fmt += sprintf(fmt, "%d", width);
677 }
678 if (precision)
679 fmt += sprintf(fmt, ".%d", precision);
680 if (longflag)
681 *fmt++ = 'l';
682 else if (size_tflag) {
683 char *f = PY_FORMAT_SIZE_T;
684 while (*f)
685 *fmt++ = *f++;
686 }
687 *fmt++ = c;
688 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000689}
690
/* Append the NUL-terminated C string `string` at the output cursor `s`,
   one character per output element, advancing `s` past the copied
   characters.  Uses the variables `copy` and `s` of the expanding scope. */
#define appendstring(string)                    \
    {copy = (string); while (*copy) *s++ = *copy++;}
692
693PyObject *
694PyUnicode_FromFormatV(const char *format, va_list vargs)
695{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000696 va_list count;
697 Py_ssize_t callcount = 0;
698 PyObject **callresults = NULL;
699 PyObject **callresult = NULL;
700 Py_ssize_t n = 0;
701 int width = 0;
702 int precision = 0;
703 int zeropad;
704 const char* f;
705 Py_UNICODE *s;
706 PyObject *string;
707 /* used by sprintf */
708 char buffer[21];
709 /* use abuffer instead of buffer, if we need more space
710 * (which can happen if there's a format specifier with width). */
711 char *abuffer = NULL;
712 char *realbuffer;
713 Py_ssize_t abuffersize = 0;
714 char fmt[60]; /* should be enough for %0width.precisionld */
715 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716
717#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000718 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#else
720#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000723 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724#endif
725#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000726 /* step 1: count the number of %S/%R/%A/%s format specifications
727 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
728 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
729 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000730 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000731 if (*f == '%') {
732 if (*(f+1)=='%')
733 continue;
734 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
735 ++callcount;
736 while (ISDIGIT((unsigned)*f))
737 width = (width*10) + *f++ - '0';
738 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
739 ;
740 if (*f == 's')
741 ++callcount;
742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 }
744 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000745 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 if (callcount) {
747 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
748 if (!callresults) {
749 PyErr_NoMemory();
750 return NULL;
751 }
752 callresult = callresults;
753 }
754 /* step 3: figure out how large a buffer we need */
755 for (f = format; *f; f++) {
756 if (*f == '%') {
757 const char* p = f;
758 width = 0;
759 while (ISDIGIT((unsigned)*f))
760 width = (width*10) + *f++ - '0';
761 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
762 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
765 * they don't affect the amount of space we reserve.
766 */
767 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000768 (f[1] == 'd' || f[1] == 'u'))
769 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 switch (*f) {
772 case 'c':
773 (void)va_arg(count, int);
774 /* fall through... */
775 case '%':
776 n++;
777 break;
778 case 'd': case 'u': case 'i': case 'x':
779 (void) va_arg(count, int);
780 /* 20 bytes is enough to hold a 64-bit
781 integer. Decimal takes the most space.
782 This isn't enough for octal.
783 If a width is specified we need more
784 (which we allocate later). */
785 if (width < 20)
786 width = 20;
787 n += width;
788 if (abuffersize < width)
789 abuffersize = width;
790 break;
791 case 's':
792 {
793 /* UTF-8 */
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000794 unsigned char *s = va_arg(count, unsigned char*);
795 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
796 if (!str)
797 goto fail;
798 n += PyUnicode_GET_SIZE(str);
799 /* Remember the str and switch to the next slot */
800 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000801 break;
802 }
803 case 'U':
804 {
805 PyObject *obj = va_arg(count, PyObject *);
806 assert(obj && PyUnicode_Check(obj));
807 n += PyUnicode_GET_SIZE(obj);
808 break;
809 }
810 case 'V':
811 {
812 PyObject *obj = va_arg(count, PyObject *);
813 const char *str = va_arg(count, const char *);
814 assert(obj || str);
815 assert(!obj || PyUnicode_Check(obj));
816 if (obj)
817 n += PyUnicode_GET_SIZE(obj);
818 else
819 n += strlen(str);
820 break;
821 }
822 case 'S':
823 {
824 PyObject *obj = va_arg(count, PyObject *);
825 PyObject *str;
826 assert(obj);
827 str = PyObject_Str(obj);
828 if (!str)
829 goto fail;
830 n += PyUnicode_GET_SIZE(str);
831 /* Remember the str and switch to the next slot */
832 *callresult++ = str;
833 break;
834 }
835 case 'R':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 PyObject *repr;
839 assert(obj);
840 repr = PyObject_Repr(obj);
841 if (!repr)
842 goto fail;
843 n += PyUnicode_GET_SIZE(repr);
844 /* Remember the repr and switch to the next slot */
845 *callresult++ = repr;
846 break;
847 }
848 case 'A':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *ascii;
852 assert(obj);
853 ascii = PyObject_ASCII(obj);
854 if (!ascii)
855 goto fail;
856 n += PyUnicode_GET_SIZE(ascii);
857 /* Remember the repr and switch to the next slot */
858 *callresult++ = ascii;
859 break;
860 }
861 case 'p':
862 (void) va_arg(count, int);
863 /* maximum 64-bit pointer representation:
864 * 0xffffffffffffffff
865 * so 19 characters is enough.
866 * XXX I count 18 -- what's the extra for?
867 */
868 n += 19;
869 break;
870 default:
871 /* if we stumble upon an unknown
872 formatting code, copy the rest of
873 the format string to the output
874 string. (we cannot just skip the
875 code, since there's no way to know
876 what's in the argument list) */
877 n += strlen(p);
878 goto expand;
879 }
880 } else
881 n++;
882 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000883 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000884 if (abuffersize > 20) {
885 abuffer = PyObject_Malloc(abuffersize);
886 if (!abuffer) {
887 PyErr_NoMemory();
888 goto fail;
889 }
890 realbuffer = abuffer;
891 }
892 else
893 realbuffer = buffer;
894 /* step 4: fill the buffer */
895 /* Since we've analyzed how much space we need for the worst case,
896 we don't have to resize the string.
897 There can be no errors beyond this point. */
898 string = PyUnicode_FromUnicode(NULL, n);
899 if (!string)
900 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000901
Benjamin Peterson14339b62009-01-31 16:36:08 +0000902 s = PyUnicode_AS_UNICODE(string);
903 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904
Benjamin Peterson14339b62009-01-31 16:36:08 +0000905 for (f = format; *f; f++) {
906 if (*f == '%') {
907 const char* p = f++;
908 int longflag = 0;
909 int size_tflag = 0;
910 zeropad = (*f == '0');
911 /* parse the width.precision part */
912 width = 0;
913 while (ISDIGIT((unsigned)*f))
914 width = (width*10) + *f++ - '0';
915 precision = 0;
916 if (*f == '.') {
917 f++;
918 while (ISDIGIT((unsigned)*f))
919 precision = (precision*10) + *f++ - '0';
920 }
921 /* handle the long flag, but only for %ld and %lu.
922 others can be added when necessary. */
923 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
924 longflag = 1;
925 ++f;
926 }
927 /* handle the size_t flag. */
928 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
929 size_tflag = 1;
930 ++f;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 switch (*f) {
934 case 'c':
935 *s++ = va_arg(vargs, int);
936 break;
937 case 'd':
938 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
939 if (longflag)
940 sprintf(realbuffer, fmt, va_arg(vargs, long));
941 else if (size_tflag)
942 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
943 else
944 sprintf(realbuffer, fmt, va_arg(vargs, int));
945 appendstring(realbuffer);
946 break;
947 case 'u':
948 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
949 if (longflag)
950 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
951 else if (size_tflag)
952 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
953 else
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
955 appendstring(realbuffer);
956 break;
957 case 'i':
958 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
959 sprintf(realbuffer, fmt, va_arg(vargs, int));
960 appendstring(realbuffer);
961 break;
962 case 'x':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 's':
968 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000969 /* unused, since we already have the result */
970 (void) va_arg(vargs, char *);
971 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
972 PyUnicode_GET_SIZE(*callresult));
973 s += PyUnicode_GET_SIZE(*callresult);
974 /* We're done with the unicode()/repr() => forget it */
975 Py_DECREF(*callresult);
976 /* switch to next unicode()/repr() result */
977 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000978 break;
979 }
980 case 'U':
981 {
982 PyObject *obj = va_arg(vargs, PyObject *);
983 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
985 s += size;
986 break;
987 }
988 case 'V':
989 {
990 PyObject *obj = va_arg(vargs, PyObject *);
991 const char *str = va_arg(vargs, const char *);
992 if (obj) {
993 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
994 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
995 s += size;
996 } else {
997 appendstring(str);
998 }
999 break;
1000 }
1001 case 'S':
1002 case 'R':
1003 {
1004 Py_UNICODE *ucopy;
1005 Py_ssize_t usize;
1006 Py_ssize_t upos;
1007 /* unused, since we already have the result */
1008 (void) va_arg(vargs, PyObject *);
1009 ucopy = PyUnicode_AS_UNICODE(*callresult);
1010 usize = PyUnicode_GET_SIZE(*callresult);
1011 for (upos = 0; upos<usize;)
1012 *s++ = ucopy[upos++];
1013 /* We're done with the unicode()/repr() => forget it */
1014 Py_DECREF(*callresult);
1015 /* switch to next unicode()/repr() result */
1016 ++callresult;
1017 break;
1018 }
1019 case 'p':
1020 sprintf(buffer, "%p", va_arg(vargs, void*));
1021 /* %p is ill-defined: ensure leading 0x. */
1022 if (buffer[1] == 'X')
1023 buffer[1] = 'x';
1024 else if (buffer[1] != 'x') {
1025 memmove(buffer+2, buffer, strlen(buffer)+1);
1026 buffer[0] = '0';
1027 buffer[1] = 'x';
1028 }
1029 appendstring(buffer);
1030 break;
1031 case '%':
1032 *s++ = '%';
1033 break;
1034 default:
1035 appendstring(p);
1036 goto end;
1037 }
1038 } else
1039 *s++ = *f;
1040 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001041
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (callresults)
1044 PyObject_Free(callresults);
1045 if (abuffer)
1046 PyObject_Free(abuffer);
1047 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1048 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001049 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (callresults) {
1051 PyObject **callresult2 = callresults;
1052 while (callresult2 < callresult) {
1053 Py_DECREF(*callresult2);
1054 ++callresult2;
1055 }
1056 PyObject_Free(callresults);
1057 }
1058 if (abuffer)
1059 PyObject_Free(abuffer);
1060 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001061}
1062
1063#undef appendstring
1064
1065PyObject *
1066PyUnicode_FromFormat(const char *format, ...)
1067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 PyObject* ret;
1069 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001070
1071#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001073#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 ret = PyUnicode_FromFormatV(format, vargs);
1077 va_end(vargs);
1078 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079}
1080
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 wchar_t *w,
1083 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
1085 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 PyErr_BadInternalCall();
1087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001089
1090 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094#ifdef HAVE_USABLE_WCHAR_T
1095 memcpy(w, unicode->str, size * sizeof(wchar_t));
1096#else
1097 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 register Py_UNICODE *u;
1099 register Py_ssize_t i;
1100 u = PyUnicode_AS_UNICODE(unicode);
1101 for (i = size; i > 0; i--)
1102 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104#endif
1105
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001106 if (size > PyUnicode_GET_SIZE(unicode))
1107 return PyUnicode_GET_SIZE(unicode);
1108 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110}
1111
1112#endif
1113
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001114PyObject *PyUnicode_FromOrdinal(int ordinal)
1115{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001116 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001119 PyErr_SetString(PyExc_ValueError,
1120 "chr() arg not in range(0x110000)");
1121 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001123
1124#ifndef Py_UNICODE_WIDE
1125 if (ordinal > 0xffff) {
1126 ordinal -= 0x10000;
1127 s[0] = 0xD800 | (ordinal >> 10);
1128 s[1] = 0xDC00 | (ordinal & 0x3FF);
1129 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001130 }
1131#endif
1132
Hye-Shik Chang40574832004-04-06 07:24:51 +00001133 s[0] = (Py_UNICODE)ordinal;
1134 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001135}
1136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137PyObject *PyUnicode_FromObject(register PyObject *obj)
1138{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001140 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001141 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001142 Py_INCREF(obj);
1143 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001144 }
1145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 /* For a Unicode subtype that's not a Unicode object,
1147 return a true Unicode object with the same data. */
1148 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1149 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001151 PyErr_Format(PyExc_TypeError,
1152 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001153 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001154 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001155}
1156
1157PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 const char *encoding,
1159 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001160{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001161 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001170 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 PyErr_SetString(PyExc_TypeError,
1172 "decoding str is not supported");
1173 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001175
1176 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001177 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001178 s = PyBytes_AS_STRING(obj);
1179 len = PyBytes_GET_SIZE(obj);
1180 }
1181 else if (PyByteArray_Check(obj)) {
1182 s = PyByteArray_AS_STRING(obj);
1183 len = PyByteArray_GET_SIZE(obj);
1184 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001185 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 /* Overwrite the error message with something more useful in
1187 case of a TypeError. */
1188 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 "coercing to str: need string or buffer, "
1191 "%.80s found",
1192 Py_TYPE(obj)->tp_name);
1193 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001194 }
Tim Petersced69f82003-09-16 20:30:58 +00001195
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001196 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001198 Py_INCREF(unicode_empty);
1199 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001202 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001203
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001204 return v;
1205
Benjamin Peterson29060642009-01-31 22:14:21 +00001206 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001211 Py_ssize_t size,
1212 const char *encoding,
1213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
1215 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001216 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001217 char lower[20]; /* Enough for any encoding name we recognize */
1218 char *l;
1219 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220
1221 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001222 encoding = PyUnicode_GetDefaultEncoding();
1223
1224 /* Convert encoding to lower case and replace '_' with '-' in order to
1225 catch e.g. UTF_8 */
1226 e = encoding;
1227 l = lower;
1228 while (*e && l < &lower[(sizeof lower) - 2]) {
1229 if (ISUPPER(*e)) {
1230 *l++ = TOLOWER(*e++);
1231 }
1232 else if (*e == '_') {
1233 *l++ = '-';
1234 e++;
1235 }
1236 else {
1237 *l++ = *e++;
1238 }
1239 }
1240 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001243 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001245 else if ((strcmp(lower, "latin-1") == 0) ||
1246 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001247 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001250 return PyUnicode_DecodeMBCS(s, size, errors);
1251#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001254 else if (strcmp(lower, "utf-16") == 0)
1255 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1256 else if (strcmp(lower, "utf-32") == 0)
1257 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258
1259 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001260 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001261 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001262 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001263 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (buffer == NULL)
1265 goto onError;
1266 unicode = PyCodec_Decode(buffer, encoding, errors);
1267 if (unicode == NULL)
1268 goto onError;
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001271 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001272 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 Py_DECREF(unicode);
1274 goto onError;
1275 }
1276 Py_DECREF(buffer);
1277 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001278
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 Py_XDECREF(buffer);
1281 return NULL;
1282}
1283
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1285 const char *encoding,
1286 const char *errors)
1287{
1288 PyObject *v;
1289
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294
1295 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001296 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297
1298 /* Decode via the codec registry */
1299 v = PyCodec_Decode(unicode, encoding, errors);
1300 if (v == NULL)
1301 goto onError;
1302 return v;
1303
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001305 return NULL;
1306}
1307
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001308PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1309 const char *encoding,
1310 const char *errors)
1311{
1312 PyObject *v;
1313
1314 if (!PyUnicode_Check(unicode)) {
1315 PyErr_BadArgument();
1316 goto onError;
1317 }
1318
1319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001321
1322 /* Decode via the codec registry */
1323 v = PyCodec_Decode(unicode, encoding, errors);
1324 if (v == NULL)
1325 goto onError;
1326 if (!PyUnicode_Check(v)) {
1327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001328 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001329 Py_TYPE(v)->tp_name);
1330 Py_DECREF(v);
1331 goto onError;
1332 }
1333 return v;
1334
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 Py_ssize_t size,
1341 const char *encoding,
1342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343{
1344 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 unicode = PyUnicode_FromUnicode(s, size);
1347 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1350 Py_DECREF(unicode);
1351 return v;
1352}
1353
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001354PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *v;
1359
1360 if (!PyUnicode_Check(unicode)) {
1361 PyErr_BadArgument();
1362 goto onError;
1363 }
1364
1365 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001367
1368 /* Encode via the codec registry */
1369 v = PyCodec_Encode(unicode, encoding, errors);
1370 if (v == NULL)
1371 goto onError;
1372 return v;
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375 return NULL;
1376}
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1379 const char *encoding,
1380 const char *errors)
1381{
1382 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 if (!PyUnicode_Check(unicode)) {
1385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
Fred Drakee4315f52000-05-09 19:53:39 +00001388
Tim Petersced69f82003-09-16 20:30:58 +00001389 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001391
1392 /* Shortcuts for common default encodings */
1393 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 if (strcmp(encoding, "utf-8") == 0)
1395 return PyUnicode_AsUTF8String(unicode);
1396 else if (strcmp(encoding, "latin-1") == 0)
1397 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001398#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 else if (strcmp(encoding, "mbcs") == 0)
1400 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "ascii") == 0)
1403 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001404 /* During bootstrap, we may need to find the encodings
1405 package, to load the file system encoding, and require the
1406 file system encoding in order to load the encodings
1407 package.
1408
1409 Break out of this dependency by assuming that the path to
1410 the encodings module is ASCII-only. XXX could try wcstombs
1411 instead, if the file system encoding is the locale's
1412 encoding. */
1413 else if (Py_FileSystemDefaultEncoding &&
1414 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1415 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
1419 /* Encode via the codec registry */
1420 v = PyCodec_Encode(unicode, encoding, errors);
1421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001422 return NULL;
1423
1424 /* The normal path */
1425 if (PyBytes_Check(v))
1426 return v;
1427
1428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001429 if (PyByteArray_Check(v)) {
1430 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001431 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 PyOS_snprintf(msg, sizeof(msg),
1433 "encoder %s returned buffer instead of bytes",
1434 encoding);
1435 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001436 Py_DECREF(v);
1437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1441 Py_DECREF(v);
1442 return b;
1443 }
1444
1445 PyErr_Format(PyExc_TypeError,
1446 "encoder did not return a bytes object (type=%.400s)",
1447 Py_TYPE(v)->tp_name);
1448 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001449 return NULL;
1450}
1451
1452PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1453 const char *encoding,
1454 const char *errors)
1455{
1456 PyObject *v;
1457
1458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 goto onError;
1461 }
1462
1463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001465
1466 /* Encode via the codec registry */
1467 v = PyCodec_Encode(unicode, encoding, errors);
1468 if (v == NULL)
1469 goto onError;
1470 if (!PyUnicode_Check(v)) {
1471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001473 Py_TYPE(v)->tp_name);
1474 Py_DECREF(v);
1475 goto onError;
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return NULL;
1481}
1482
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001483PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001485{
1486 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001487 if (v)
1488 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 if (errors != NULL)
1490 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001491 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001492 PyUnicode_GET_SIZE(unicode),
1493 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001496 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001497 return v;
1498}
1499
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001500PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001501PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001502 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001503 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1504}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505
Christian Heimes5894ba72007-11-04 11:43:14 +00001506PyObject*
1507PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1508{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001509 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1510 can be undefined. If it is case, decode using UTF-8. The following assumes
1511 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1512 bootstrapping process where the codecs aren't ready yet.
1513 */
1514 if (Py_FileSystemDefaultEncoding) {
1515#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001516 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001517 return PyUnicode_DecodeMBCS(s, size, "replace");
1518 }
1519#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001520 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001521 return PyUnicode_DecodeUTF8(s, size, "replace");
1522 }
1523#endif
1524 return PyUnicode_Decode(s, size,
1525 Py_FileSystemDefaultEncoding,
1526 "replace");
1527 }
1528 else {
1529 return PyUnicode_DecodeUTF8(s, size, "replace");
1530 }
1531}
1532
Martin v. Löwis5b222132007-06-10 09:51:05 +00001533char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001534_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001535{
Christian Heimesf3863112007-11-22 07:46:41 +00001536 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001537 if (!PyUnicode_Check(unicode)) {
1538 PyErr_BadArgument();
1539 return NULL;
1540 }
Christian Heimesf3863112007-11-22 07:46:41 +00001541 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1542 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001543 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001544 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001545 *psize = PyBytes_GET_SIZE(bytes);
1546 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001547}
1548
1549char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001550_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001551{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001552 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001553}
1554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1556{
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561 return PyUnicode_AS_UNICODE(unicode);
1562
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 return NULL;
1565}
1566
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568{
1569 if (!PyUnicode_Check(unicode)) {
1570 PyErr_BadArgument();
1571 goto onError;
1572 }
1573 return PyUnicode_GET_SIZE(unicode);
1574
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 return -1;
1577}
1578
Thomas Wouters78890102000-07-22 19:25:51 +00001579const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001580{
1581 return unicode_default_encoding;
1582}
1583
1584int PyUnicode_SetDefaultEncoding(const char *encoding)
1585{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001586 if (strcmp(encoding, unicode_default_encoding) != 0) {
1587 PyErr_Format(PyExc_ValueError,
1588 "Can only set default encoding to %s",
1589 unicode_default_encoding);
1590 return -1;
1591 }
Fred Drakee4315f52000-05-09 19:53:39 +00001592 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001593}
1594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001595/* error handling callback helper:
1596 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001597 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 and adjust various state variables.
1599 return 0 on success, -1 on error
1600*/
1601
1602static
1603int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 const char *encoding, const char *reason,
1605 const char **input, const char **inend, Py_ssize_t *startinpos,
1606 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1607 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001609 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610
1611 PyObject *restuple = NULL;
1612 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001613 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001615 Py_ssize_t requiredsize;
1616 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001618 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001619 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 int res = -1;
1621
1622 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001623 *errorHandler = PyCodec_LookupError(errors);
1624 if (*errorHandler == NULL)
1625 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 }
1627
1628 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001629 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001630 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1631 if (*exceptionObject == NULL)
1632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 }
1634 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001635 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1636 goto onError;
1637 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1638 goto onError;
1639 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1640 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001641 }
1642
1643 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1644 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001645 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001647 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 }
1650 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001652
1653 /* Copy back the bytes variables, which might have been modified by the
1654 callback */
1655 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1656 if (!inputobj)
1657 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001658 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001660 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001661 *input = PyBytes_AS_STRING(inputobj);
1662 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001663 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001664 /* we can DECREF safely, as the exception has another reference,
1665 so the object won't go away. */
1666 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001670 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1672 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001673 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674
1675 /* need more space? (at least enough for what we
1676 have+the replacement+the rest of the string (starting
1677 at the new input position), so we won't have to check space
1678 when there are no errors in the rest of the string) */
1679 repptr = PyUnicode_AS_UNICODE(repunicode);
1680 repsize = PyUnicode_GET_SIZE(repunicode);
1681 requiredsize = *outpos + repsize + insize-newpos;
1682 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001683 if (requiredsize<2*outsize)
1684 requiredsize = 2*outsize;
1685 if (_PyUnicode_Resize(output, requiredsize) < 0)
1686 goto onError;
1687 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 }
1689 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001690 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 Py_UNICODE_COPY(*outptr, repptr, repsize);
1692 *outptr += repsize;
1693 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 /* we made it! */
1696 res = 0;
1697
Benjamin Peterson29060642009-01-31 22:14:21 +00001698 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 Py_XDECREF(restuple);
1700 return res;
1701}
1702
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Base-64 helper macros.  Each one may evaluate its argument several
   times, so only pass expressions without side effects. */

/* IS_BASE64: non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of base-64 character c.  Only meaningful
   when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
1737
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so the table is never indexed out of
 * bounds.  directO and directWS are parenthesized in the expansion so
 * that compound expressions (e.g. "a || b") can be passed safely.  c is
 * evaluated several times; pass an expression without side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001784PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 Py_ssize_t size,
1786 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001788 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1789}
1790
Antoine Pitrou244651a2009-05-04 18:56:13 +00001791/* The decoder. The only state we preserve is our read position,
1792 * i.e. how many characters we have consumed. So if we end in the
1793 * middle of a shift sequence we have to back off the read position
1794 * and the output to the beginning of the sequence, otherwise we lose
1795 * all the shift state (seen bits, number of bits seen, high
1796 * surrogate). */
1797
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001798PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 Py_ssize_t size,
1800 const char *errors,
1801 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001802{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001804 Py_ssize_t startinpos;
1805 Py_ssize_t endinpos;
1806 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 const char *e;
1808 PyUnicodeObject *unicode;
1809 Py_UNICODE *p;
1810 const char *errmsg = "";
1811 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001812 Py_UNICODE *shiftOutStart;
1813 unsigned int base64bits = 0;
1814 unsigned long base64buffer = 0;
1815 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 PyObject *errorHandler = NULL;
1817 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818
1819 unicode = _PyUnicode_New(size);
1820 if (!unicode)
1821 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001822 if (size == 0) {
1823 if (consumed)
1824 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001825 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001826 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827
1828 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001829 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 e = s + size;
1831
1832 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001835 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001836
Antoine Pitrou244651a2009-05-04 18:56:13 +00001837 if (inShift) { /* in a base-64 section */
1838 if (IS_BASE64(ch)) { /* consume a base-64 character */
1839 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1840 base64bits += 6;
1841 s++;
1842 if (base64bits >= 16) {
1843 /* we have enough bits for a UTF-16 value */
1844 Py_UNICODE outCh = (Py_UNICODE)
1845 (base64buffer >> (base64bits-16));
1846 base64bits -= 16;
1847 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1848 if (surrogate) {
1849 /* expecting a second surrogate */
1850 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1851#ifdef Py_UNICODE_WIDE
1852 *p++ = (((surrogate & 0x3FF)<<10)
1853 | (outCh & 0x3FF)) + 0x10000;
1854#else
1855 *p++ = surrogate;
1856 *p++ = outCh;
1857#endif
1858 surrogate = 0;
1859 }
1860 else {
1861 surrogate = 0;
1862 errmsg = "second surrogate missing";
1863 goto utf7Error;
1864 }
1865 }
1866 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1867 /* first surrogate */
1868 surrogate = outCh;
1869 }
1870 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1871 errmsg = "unexpected second surrogate";
1872 goto utf7Error;
1873 }
1874 else {
1875 *p++ = outCh;
1876 }
1877 }
1878 }
1879 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001880 inShift = 0;
1881 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001882 if (surrogate) {
1883 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001884 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001885 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001886 if (base64bits > 0) { /* left-over bits */
1887 if (base64bits >= 6) {
1888 /* We've seen at least one base-64 character */
1889 errmsg = "partial character in shift sequence";
1890 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001891 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001892 else {
1893 /* Some bits remain; they should be zero */
1894 if (base64buffer != 0) {
1895 errmsg = "non-zero padding bits in shift sequence";
1896 goto utf7Error;
1897 }
1898 }
1899 }
1900 if (ch != '-') {
1901 /* '-' is absorbed; other terminating
1902 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *p++ = ch;
1904 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001905 }
1906 }
1907 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001909 s++; /* consume '+' */
1910 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001911 s++;
1912 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001913 }
1914 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001916 shiftOutStart = p;
1917 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001918 }
1919 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001920 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001921 *p++ = ch;
1922 s++;
1923 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001924 else {
1925 startinpos = s-starts;
1926 s++;
1927 errmsg = "unexpected special character";
1928 goto utf7Error;
1929 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001930 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001931utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 outpos = p-PyUnicode_AS_UNICODE(unicode);
1933 endinpos = s-starts;
1934 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001935 errors, &errorHandler,
1936 "utf7", errmsg,
1937 &starts, &e, &startinpos, &endinpos, &exc, &s,
1938 &unicode, &outpos, &p))
1939 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001940 }
1941
Antoine Pitrou244651a2009-05-04 18:56:13 +00001942 /* end of string */
1943
1944 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1945 /* if we're in an inconsistent state, that's an error */
1946 if (surrogate ||
1947 (base64bits >= 6) ||
1948 (base64bits > 0 && base64buffer != 0)) {
1949 outpos = p-PyUnicode_AS_UNICODE(unicode);
1950 endinpos = size;
1951 if (unicode_decode_call_errorhandler(
1952 errors, &errorHandler,
1953 "utf7", "unterminated shift sequence",
1954 &starts, &e, &startinpos, &endinpos, &exc, &s,
1955 &unicode, &outpos, &p))
1956 goto onError;
1957 if (s < e)
1958 goto restart;
1959 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001960 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001961
1962 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001963 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00001964 if (inShift) {
1965 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001966 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 }
1968 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001969 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001970 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001971 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001973 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001974 goto onError;
1975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001976 Py_XDECREF(errorHandler);
1977 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978 return (PyObject *)unicode;
1979
Benjamin Peterson29060642009-01-31 22:14:21 +00001980 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 Py_XDECREF(errorHandler);
1982 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001983 Py_DECREF(unicode);
1984 return NULL;
1985}
1986
1987
1988PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001989 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00001990 int base64SetO,
1991 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001994 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001995 /* It might be possible to tighten this worst case */
Antoine Pitrou244651a2009-05-04 18:56:13 +00001996 Py_ssize_t allocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001997 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001998 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001999 unsigned int base64bits = 0;
2000 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002001 char * out;
2002 char * start;
2003
2004 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002005 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002006
Antoine Pitrou244651a2009-05-04 18:56:13 +00002007 if (allocated / 5 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002008 return PyErr_NoMemory();
2009
Antoine Pitrou244651a2009-05-04 18:56:13 +00002010 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002011 if (v == NULL)
2012 return NULL;
2013
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002014 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002015 for (;i < size; ++i) {
2016 Py_UNICODE ch = s[i];
2017
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 if (inShift) {
2019 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2020 /* shifting out */
2021 if (base64bits) { /* output remaining bits */
2022 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2023 base64buffer = 0;
2024 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002025 }
2026 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002027 /* Characters not in the BASE64 set implicitly unshift the sequence
2028 so no '-' is required, except if the character is itself a '-' */
2029 if (IS_BASE64(ch) || ch == '-') {
2030 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002031 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 *out++ = (char) ch;
2033 }
2034 else {
2035 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002036 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002038 else { /* not in a shift sequence */
2039 if (ch == '+') {
2040 *out++ = '+';
2041 *out++ = '-';
2042 }
2043 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2044 *out++ = (char) ch;
2045 }
2046 else {
2047 *out++ = '+';
2048 inShift = 1;
2049 goto encode_char;
2050 }
2051 }
2052 continue;
2053encode_char:
2054#ifdef Py_UNICODE_WIDE
2055 if (ch >= 0x10000) {
2056 /* code first surrogate */
2057 base64bits += 16;
2058 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2059 while (base64bits >= 6) {
2060 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2061 base64bits -= 6;
2062 }
2063 /* prepare second surrogate */
2064 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2065 }
2066#endif
2067 base64bits += 16;
2068 base64buffer = (base64buffer << 16) | ch;
2069 while (base64bits >= 6) {
2070 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2071 base64bits -= 6;
2072 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002073 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 if (base64bits)
2075 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2076 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002077 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002078 if (_PyBytes_Resize(&v, out - start) < 0)
2079 return NULL;
2080 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002081}
2082
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083#undef IS_BASE64
2084#undef FROM_BASE64
2085#undef TO_BASE64
2086#undef DECODE_DIRECT
2087#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089/* --- UTF-8 Codec -------------------------------------------------------- */
2090
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2112
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 Py_ssize_t size,
2115 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116{
Walter Dörwald69652032004-09-07 20:24:22 +00002117 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2118}
2119
Antoine Pitrouab868312009-01-10 15:40:25 +00002120/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2121#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2122
2123/* Mask to quickly check whether a C 'long' contains a
2124 non-ASCII, UTF8-encoded char. */
2125#if (SIZEOF_LONG == 8)
2126# define ASCII_CHAR_MASK 0x8080808080808080L
2127#elif (SIZEOF_LONG == 4)
2128# define ASCII_CHAR_MASK 0x80808080L
2129#else
2130# error C 'long' size should be either 4 or 8!
2131#endif
2132
Walter Dörwald69652032004-09-07 20:24:22 +00002133PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002134 Py_ssize_t size,
2135 const char *errors,
2136 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002140 Py_ssize_t startinpos;
2141 Py_ssize_t endinpos;
2142 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002143 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 PyUnicodeObject *unicode;
2145 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002146 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 PyObject *errorHandler = NULL;
2148 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149
2150 /* Note: size will always be longer than the resulting Unicode
2151 character count */
2152 unicode = _PyUnicode_New(size);
2153 if (!unicode)
2154 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002155 if (size == 0) {
2156 if (consumed)
2157 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160
2161 /* Unpack UTF-8 encoded data */
2162 p = unicode->str;
2163 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002164 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165
2166 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002167 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168
2169 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002170 /* Fast path for runs of ASCII characters. Given that common UTF-8
2171 input will consist of an overwhelming majority of ASCII
2172 characters, we try to optimize for this case by checking
2173 as many characters as a C 'long' can contain.
2174 First, check if we can do an aligned read, as most CPUs have
2175 a penalty for unaligned reads.
2176 */
2177 if (!((size_t) s & LONG_PTR_MASK)) {
2178 /* Help register allocation */
2179 register const char *_s = s;
2180 register Py_UNICODE *_p = p;
2181 while (_s < aligned_end) {
2182 /* Read a whole long at a time (either 4 or 8 bytes),
2183 and do a fast unrolled copy if it only contains ASCII
2184 characters. */
2185 unsigned long data = *(unsigned long *) _s;
2186 if (data & ASCII_CHAR_MASK)
2187 break;
2188 _p[0] = (unsigned char) _s[0];
2189 _p[1] = (unsigned char) _s[1];
2190 _p[2] = (unsigned char) _s[2];
2191 _p[3] = (unsigned char) _s[3];
2192#if (SIZEOF_LONG == 8)
2193 _p[4] = (unsigned char) _s[4];
2194 _p[5] = (unsigned char) _s[5];
2195 _p[6] = (unsigned char) _s[6];
2196 _p[7] = (unsigned char) _s[7];
2197#endif
2198 _s += SIZEOF_LONG;
2199 _p += SIZEOF_LONG;
2200 }
2201 s = _s;
2202 p = _p;
2203 if (s == e)
2204 break;
2205 ch = (unsigned char)*s;
2206 }
2207 }
2208
2209 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 s++;
2212 continue;
2213 }
2214
2215 n = utf8_code_length[ch];
2216
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002217 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 if (consumed)
2219 break;
2220 else {
2221 errmsg = "unexpected end of data";
2222 startinpos = s-starts;
2223 endinpos = size;
2224 goto utf8Error;
2225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227
2228 switch (n) {
2229
2230 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002231 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002232 startinpos = s-starts;
2233 endinpos = startinpos+1;
2234 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235
2236 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002237 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002238 startinpos = s-starts;
2239 endinpos = startinpos+1;
2240 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241
2242 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002243 if ((s[1] & 0xc0) != 0x80) {
2244 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002245 startinpos = s-starts;
2246 endinpos = startinpos+2;
2247 goto utf8Error;
2248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002250 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002251 startinpos = s-starts;
2252 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002253 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002254 goto utf8Error;
2255 }
2256 else
2257 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 break;
2259
2260 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002261 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002262 (s[2] & 0xc0) != 0x80) {
2263 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 startinpos = s-starts;
2265 endinpos = startinpos+3;
2266 goto utf8Error;
2267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002269 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002270 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002271 startinpos = s-starts;
2272 endinpos = startinpos+3;
2273 goto utf8Error;
2274 }
2275 else
2276 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002277 break;
2278
2279 case 4:
2280 if ((s[1] & 0xc0) != 0x80 ||
2281 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002282 (s[3] & 0xc0) != 0x80) {
2283 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002284 startinpos = s-starts;
2285 endinpos = startinpos+4;
2286 goto utf8Error;
2287 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002288 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002290 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002291 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002293 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002294 UTF-16 */
2295 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002296 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002297 startinpos = s-starts;
2298 endinpos = startinpos+4;
2299 goto utf8Error;
2300 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002301#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002303#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002304 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002305
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002306 /* translate from 10000..10FFFF to 0..FFFF */
2307 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002308
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002309 /* high surrogate = top 10 bits added to D800 */
2310 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002311
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002312 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002313 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 break;
2316
2317 default:
2318 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002319 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002320 startinpos = s-starts;
2321 endinpos = startinpos+n;
2322 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 }
2324 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002325 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002326
Benjamin Peterson29060642009-01-31 22:14:21 +00002327 utf8Error:
2328 outpos = p-PyUnicode_AS_UNICODE(unicode);
2329 if (unicode_decode_call_errorhandler(
2330 errors, &errorHandler,
2331 "utf8", errmsg,
2332 &starts, &e, &startinpos, &endinpos, &exc, &s,
2333 &unicode, &outpos, &p))
2334 goto onError;
2335 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 }
Walter Dörwald69652032004-09-07 20:24:22 +00002337 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002338 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339
2340 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342 goto onError;
2343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 return (PyObject *)unicode;
2347
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 Py_XDECREF(errorHandler);
2350 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 Py_DECREF(unicode);
2352 return NULL;
2353}
2354
Antoine Pitrouab868312009-01-10 15:40:25 +00002355#undef ASCII_CHAR_MASK
2356
2357
Tim Peters602f7402002-04-27 18:03:26 +00002358/* Allocation strategy: if the string is short, convert into a stack buffer
2359 and allocate exactly as much space needed at the end. Else allocate the
2360 maximum possible needed (4 result bytes per Unicode character), and return
2361 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002362*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002363PyObject *
2364PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002365 Py_ssize_t size,
2366 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367{
Tim Peters602f7402002-04-27 18:03:26 +00002368#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002369
Guido van Rossum98297ee2007-11-06 21:34:58 +00002370 Py_ssize_t i; /* index into s of next input byte */
2371 PyObject *result; /* result string object */
2372 char *p; /* next free byte in output buffer */
2373 Py_ssize_t nallocated; /* number of result bytes allocated */
2374 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002375 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002378
Tim Peters602f7402002-04-27 18:03:26 +00002379 assert(s != NULL);
2380 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
Tim Peters602f7402002-04-27 18:03:26 +00002382 if (size <= MAX_SHORT_UNICHARS) {
2383 /* Write into the stack buffer; nallocated can't overflow.
2384 * At the end, we'll allocate exactly as much heap space as it
2385 * turns out we need.
2386 */
2387 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002388 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002389 p = stackbuf;
2390 }
2391 else {
2392 /* Overallocate on the heap, and give the excess back at the end. */
2393 nallocated = size * 4;
2394 if (nallocated / 4 != size) /* overflow! */
2395 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002396 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002397 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002398 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002399 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002400 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002401
Tim Peters602f7402002-04-27 18:03:26 +00002402 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002403 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002404
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002405 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002406 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002408
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002410 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002411 *p++ = (char)(0xc0 | (ch >> 6));
2412 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002413 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002414 else {
Tim Peters602f7402002-04-27 18:03:26 +00002415 /* Encode UCS2 Unicode ordinals */
2416 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002417#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002418 /* Special case: check for high surrogate */
2419 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2420 Py_UCS4 ch2 = s[i];
2421 /* Check for low surrogate and combine the two to
2422 form a UCS4 value */
2423 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002424 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002425 i++;
2426 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002427 }
Tim Peters602f7402002-04-27 18:03:26 +00002428 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002429 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002430#endif
2431 if (ch >= 0xd800 && ch <= 0xdfff) {
2432 Py_ssize_t newpos;
2433 PyObject *rep;
2434 char *prep;
2435 int k;
2436 rep = unicode_encode_call_errorhandler
2437 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2438 s, size, &exc, i-1, i, &newpos);
2439 if (!rep)
2440 goto error;
2441 /* Implementation limitations: only support error handler that return
2442 bytes, and only support up to four replacement bytes. */
2443 if (!PyBytes_Check(rep)) {
2444 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2445 Py_DECREF(rep);
2446 goto error;
2447 }
2448 if (PyBytes_Size(rep) > 4) {
2449 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2450 Py_DECREF(rep);
2451 goto error;
2452 }
2453 prep = PyBytes_AsString(rep);
2454 for(k = PyBytes_Size(rep); k > 0; k--)
2455 *p++ = *prep++;
2456 Py_DECREF(rep);
2457 continue;
2458
2459 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002460 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002461 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2462 *p++ = (char)(0x80 | (ch & 0x3f));
2463 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002464 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002466 /* Encode UCS4 Unicode ordinals */
2467 *p++ = (char)(0xf0 | (ch >> 18));
2468 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2469 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2470 *p++ = (char)(0x80 | (ch & 0x3f));
2471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002473
Guido van Rossum98297ee2007-11-06 21:34:58 +00002474 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002475 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002477 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002478 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002479 }
2480 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002481 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002482 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002483 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002484 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002485 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002486 Py_XDECREF(errorHandler);
2487 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002488 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002489 error:
2490 Py_XDECREF(errorHandler);
2491 Py_XDECREF(exc);
2492 Py_XDECREF(result);
2493 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002494
Tim Peters602f7402002-04-27 18:03:26 +00002495#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496}
2497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2499{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 if (!PyUnicode_Check(unicode)) {
2501 PyErr_BadArgument();
2502 return NULL;
2503 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 PyUnicode_GET_SIZE(unicode),
2506 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507}
2508
Walter Dörwald41980ca2007-08-16 21:55:45 +00002509/* --- UTF-32 Codec ------------------------------------------------------- */
2510
2511PyObject *
2512PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 Py_ssize_t size,
2514 const char *errors,
2515 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002516{
2517 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2518}
2519
2520PyObject *
2521PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 Py_ssize_t size,
2523 const char *errors,
2524 int *byteorder,
2525 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002526{
2527 const char *starts = s;
2528 Py_ssize_t startinpos;
2529 Py_ssize_t endinpos;
2530 Py_ssize_t outpos;
2531 PyUnicodeObject *unicode;
2532 Py_UNICODE *p;
2533#ifndef Py_UNICODE_WIDE
2534 int i, pairs;
2535#else
2536 const int pairs = 0;
2537#endif
2538 const unsigned char *q, *e;
2539 int bo = 0; /* assume native ordering by default */
2540 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002541 /* Offsets from q for retrieving bytes in the right order. */
2542#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2543 int iorder[] = {0, 1, 2, 3};
2544#else
2545 int iorder[] = {3, 2, 1, 0};
2546#endif
2547 PyObject *errorHandler = NULL;
2548 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002549 /* On narrow builds we split characters outside the BMP into two
2550 codepoints => count how much extra space we need. */
2551#ifndef Py_UNICODE_WIDE
2552 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002553 if (((Py_UCS4 *)s)[i] >= 0x10000)
2554 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002555#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002556
2557 /* This might be one to much, because of a BOM */
2558 unicode = _PyUnicode_New((size+3)/4+pairs);
2559 if (!unicode)
2560 return NULL;
2561 if (size == 0)
2562 return (PyObject *)unicode;
2563
2564 /* Unpack UTF-32 encoded data */
2565 p = unicode->str;
2566 q = (unsigned char *)s;
2567 e = q + size;
2568
2569 if (byteorder)
2570 bo = *byteorder;
2571
2572 /* Check for BOM marks (U+FEFF) in the input and adjust current
2573 byte order setting accordingly. In native mode, the leading BOM
2574 mark is skipped, in all other modes, it is copied to the output
2575 stream as-is (giving a ZWNBSP character). */
2576 if (bo == 0) {
2577 if (size >= 4) {
2578 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002579 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002580#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 if (bom == 0x0000FEFF) {
2582 q += 4;
2583 bo = -1;
2584 }
2585 else if (bom == 0xFFFE0000) {
2586 q += 4;
2587 bo = 1;
2588 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002589#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002590 if (bom == 0x0000FEFF) {
2591 q += 4;
2592 bo = 1;
2593 }
2594 else if (bom == 0xFFFE0000) {
2595 q += 4;
2596 bo = -1;
2597 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002598#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002600 }
2601
2602 if (bo == -1) {
2603 /* force LE */
2604 iorder[0] = 0;
2605 iorder[1] = 1;
2606 iorder[2] = 2;
2607 iorder[3] = 3;
2608 }
2609 else if (bo == 1) {
2610 /* force BE */
2611 iorder[0] = 3;
2612 iorder[1] = 2;
2613 iorder[2] = 1;
2614 iorder[3] = 0;
2615 }
2616
2617 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002618 Py_UCS4 ch;
2619 /* remaining bytes at the end? (size should be divisible by 4) */
2620 if (e-q<4) {
2621 if (consumed)
2622 break;
2623 errmsg = "truncated data";
2624 startinpos = ((const char *)q)-starts;
2625 endinpos = ((const char *)e)-starts;
2626 goto utf32Error;
2627 /* The remaining input chars are ignored if the callback
2628 chooses to skip the input */
2629 }
2630 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2631 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002632
Benjamin Peterson29060642009-01-31 22:14:21 +00002633 if (ch >= 0x110000)
2634 {
2635 errmsg = "codepoint not in range(0x110000)";
2636 startinpos = ((const char *)q)-starts;
2637 endinpos = startinpos+4;
2638 goto utf32Error;
2639 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 if (ch >= 0x10000)
2642 {
2643 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2644 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2645 }
2646 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002648 *p++ = ch;
2649 q += 4;
2650 continue;
2651 utf32Error:
2652 outpos = p-PyUnicode_AS_UNICODE(unicode);
2653 if (unicode_decode_call_errorhandler(
2654 errors, &errorHandler,
2655 "utf32", errmsg,
2656 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2657 &unicode, &outpos, &p))
2658 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002659 }
2660
2661 if (byteorder)
2662 *byteorder = bo;
2663
2664 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002666
2667 /* Adjust length */
2668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2669 goto onError;
2670
2671 Py_XDECREF(errorHandler);
2672 Py_XDECREF(exc);
2673 return (PyObject *)unicode;
2674
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002676 Py_DECREF(unicode);
2677 Py_XDECREF(errorHandler);
2678 Py_XDECREF(exc);
2679 return NULL;
2680}
2681
2682PyObject *
2683PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 Py_ssize_t size,
2685 const char *errors,
2686 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002687{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002688 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002689 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002690 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002691#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002692 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002693#else
2694 const int pairs = 0;
2695#endif
2696 /* Offsets from p for storing byte pairs in the right order. */
2697#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2698 int iorder[] = {0, 1, 2, 3};
2699#else
2700 int iorder[] = {3, 2, 1, 0};
2701#endif
2702
Benjamin Peterson29060642009-01-31 22:14:21 +00002703#define STORECHAR(CH) \
2704 do { \
2705 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2706 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2707 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2708 p[iorder[0]] = (CH) & 0xff; \
2709 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002710 } while(0)
2711
2712 /* In narrow builds we can output surrogate pairs as one codepoint,
2713 so we need less space. */
2714#ifndef Py_UNICODE_WIDE
2715 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002716 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2717 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2718 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002720 nsize = (size - pairs + (byteorder == 0));
2721 bytesize = nsize * 4;
2722 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002724 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002725 if (v == NULL)
2726 return NULL;
2727
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002728 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002729 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002731 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002732 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002733
2734 if (byteorder == -1) {
2735 /* force LE */
2736 iorder[0] = 0;
2737 iorder[1] = 1;
2738 iorder[2] = 2;
2739 iorder[3] = 3;
2740 }
2741 else if (byteorder == 1) {
2742 /* force BE */
2743 iorder[0] = 3;
2744 iorder[1] = 2;
2745 iorder[2] = 1;
2746 iorder[3] = 0;
2747 }
2748
2749 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002750 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002751#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2753 Py_UCS4 ch2 = *s;
2754 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2755 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2756 s++;
2757 size--;
2758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002760#endif
2761 STORECHAR(ch);
2762 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002763
2764 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002765 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766#undef STORECHAR
2767}
2768
2769PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2770{
2771 if (!PyUnicode_Check(unicode)) {
2772 PyErr_BadArgument();
2773 return NULL;
2774 }
2775 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 PyUnicode_GET_SIZE(unicode),
2777 NULL,
2778 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779}
2780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781/* --- UTF-16 Codec ------------------------------------------------------- */
2782
Tim Peters772747b2001-08-09 22:21:55 +00002783PyObject *
2784PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 Py_ssize_t size,
2786 const char *errors,
2787 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788{
Walter Dörwald69652032004-09-07 20:24:22 +00002789 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2790}
2791
Antoine Pitrouab868312009-01-10 15:40:25 +00002792/* Two masks for fast checking of whether a C 'long' may contain
2793 UTF16-encoded surrogate characters. This is an efficient heuristic,
2794 assuming that non-surrogate characters with a code point >= 0x8000 are
2795 rare in most input.
2796 FAST_CHAR_MASK is used when the input is in native byte ordering,
2797 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002798*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002799#if (SIZEOF_LONG == 8)
2800# define FAST_CHAR_MASK 0x8000800080008000L
2801# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2802#elif (SIZEOF_LONG == 4)
2803# define FAST_CHAR_MASK 0x80008000L
2804# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2805#else
2806# error C 'long' size should be either 4 or 8!
2807#endif
2808
Walter Dörwald69652032004-09-07 20:24:22 +00002809PyObject *
2810PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002811 Py_ssize_t size,
2812 const char *errors,
2813 int *byteorder,
2814 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002817 Py_ssize_t startinpos;
2818 Py_ssize_t endinpos;
2819 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 PyUnicodeObject *unicode;
2821 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002822 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002823 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002824 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002825 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002826 /* Offsets from q for retrieving byte pairs in the right order. */
2827#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2828 int ihi = 1, ilo = 0;
2829#else
2830 int ihi = 0, ilo = 1;
2831#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 PyObject *errorHandler = NULL;
2833 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
2835 /* Note: size will always be longer than the resulting Unicode
2836 character count */
2837 unicode = _PyUnicode_New(size);
2838 if (!unicode)
2839 return NULL;
2840 if (size == 0)
2841 return (PyObject *)unicode;
2842
2843 /* Unpack UTF-16 encoded data */
2844 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002845 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002846 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847
2848 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002849 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002851 /* Check for BOM marks (U+FEFF) in the input and adjust current
2852 byte order setting accordingly. In native mode, the leading BOM
2853 mark is skipped, in all other modes, it is copied to the output
2854 stream as-is (giving a ZWNBSP character). */
2855 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002856 if (size >= 2) {
2857 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002858#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 if (bom == 0xFEFF) {
2860 q += 2;
2861 bo = -1;
2862 }
2863 else if (bom == 0xFFFE) {
2864 q += 2;
2865 bo = 1;
2866 }
Tim Petersced69f82003-09-16 20:30:58 +00002867#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002868 if (bom == 0xFEFF) {
2869 q += 2;
2870 bo = 1;
2871 }
2872 else if (bom == 0xFFFE) {
2873 q += 2;
2874 bo = -1;
2875 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002876#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
Tim Peters772747b2001-08-09 22:21:55 +00002880 if (bo == -1) {
2881 /* force LE */
2882 ihi = 1;
2883 ilo = 0;
2884 }
2885 else if (bo == 1) {
2886 /* force BE */
2887 ihi = 0;
2888 ilo = 1;
2889 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002890#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2891 native_ordering = ilo < ihi;
2892#else
2893 native_ordering = ilo > ihi;
2894#endif
Tim Peters772747b2001-08-09 22:21:55 +00002895
Antoine Pitrouab868312009-01-10 15:40:25 +00002896 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002897 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002899 /* First check for possible aligned read of a C 'long'. Unaligned
2900 reads are more expensive, better to defer to another iteration. */
2901 if (!((size_t) q & LONG_PTR_MASK)) {
2902 /* Fast path for runs of non-surrogate chars. */
2903 register const unsigned char *_q = q;
2904 Py_UNICODE *_p = p;
2905 if (native_ordering) {
2906 /* Native ordering is simple: as long as the input cannot
2907 possibly contain a surrogate char, do an unrolled copy
2908 of several 16-bit code points to the target object.
2909 The non-surrogate check is done on several input bytes
2910 at a time (as many as a C 'long' can contain). */
2911 while (_q < aligned_end) {
2912 unsigned long data = * (unsigned long *) _q;
2913 if (data & FAST_CHAR_MASK)
2914 break;
2915 _p[0] = ((unsigned short *) _q)[0];
2916 _p[1] = ((unsigned short *) _q)[1];
2917#if (SIZEOF_LONG == 8)
2918 _p[2] = ((unsigned short *) _q)[2];
2919 _p[3] = ((unsigned short *) _q)[3];
2920#endif
2921 _q += SIZEOF_LONG;
2922 _p += SIZEOF_LONG / 2;
2923 }
2924 }
2925 else {
2926 /* Byteswapped ordering is similar, but we must decompose
2927 the copy bytewise, and take care of zero'ing out the
2928 upper bytes if the target object is in 32-bit units
2929 (that is, in UCS-4 builds). */
2930 while (_q < aligned_end) {
2931 unsigned long data = * (unsigned long *) _q;
2932 if (data & SWAPPED_FAST_CHAR_MASK)
2933 break;
2934 /* Zero upper bytes in UCS-4 builds */
2935#if (Py_UNICODE_SIZE > 2)
2936 _p[0] = 0;
2937 _p[1] = 0;
2938#if (SIZEOF_LONG == 8)
2939 _p[2] = 0;
2940 _p[3] = 0;
2941#endif
2942#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002943 /* Issue #4916; UCS-4 builds on big endian machines must
2944 fill the two last bytes of each 4-byte unit. */
2945#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2946# define OFF 2
2947#else
2948# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00002949#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00002950 ((unsigned char *) _p)[OFF + 1] = _q[0];
2951 ((unsigned char *) _p)[OFF + 0] = _q[1];
2952 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2953 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2954#if (SIZEOF_LONG == 8)
2955 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2956 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2957 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2958 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2959#endif
2960#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00002961 _q += SIZEOF_LONG;
2962 _p += SIZEOF_LONG / 2;
2963 }
2964 }
2965 p = _p;
2966 q = _q;
2967 if (q >= e)
2968 break;
2969 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002971
Benjamin Peterson14339b62009-01-31 16:36:08 +00002972 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00002973
2974 if (ch < 0xD800 || ch > 0xDFFF) {
2975 *p++ = ch;
2976 continue;
2977 }
2978
2979 /* UTF-16 code pair: */
2980 if (q > e) {
2981 errmsg = "unexpected end of data";
2982 startinpos = (((const char *)q) - 2) - starts;
2983 endinpos = ((const char *)e) + 1 - starts;
2984 goto utf16Error;
2985 }
2986 if (0xD800 <= ch && ch <= 0xDBFF) {
2987 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2988 q += 2;
2989 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002990#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 *p++ = ch;
2992 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002993#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002995#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 continue;
2997 }
2998 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002999 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 startinpos = (((const char *)q)-4)-starts;
3001 endinpos = startinpos+2;
3002 goto utf16Error;
3003 }
3004
Benjamin Peterson14339b62009-01-31 16:36:08 +00003005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003006 errmsg = "illegal encoding";
3007 startinpos = (((const char *)q)-2)-starts;
3008 endinpos = startinpos+2;
3009 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 utf16Error:
3012 outpos = p - PyUnicode_AS_UNICODE(unicode);
3013 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003014 errors,
3015 &errorHandler,
3016 "utf16", errmsg,
3017 &starts,
3018 (const char **)&e,
3019 &startinpos,
3020 &endinpos,
3021 &exc,
3022 (const char **)&q,
3023 &unicode,
3024 &outpos,
3025 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003028 /* remaining byte at the end? (size should be even) */
3029 if (e == q) {
3030 if (!consumed) {
3031 errmsg = "truncated data";
3032 startinpos = ((const char *)q) - starts;
3033 endinpos = ((const char *)e) + 1 - starts;
3034 outpos = p - PyUnicode_AS_UNICODE(unicode);
3035 if (unicode_decode_call_errorhandler(
3036 errors,
3037 &errorHandler,
3038 "utf16", errmsg,
3039 &starts,
3040 (const char **)&e,
3041 &startinpos,
3042 &endinpos,
3043 &exc,
3044 (const char **)&q,
3045 &unicode,
3046 &outpos,
3047 &p))
3048 goto onError;
3049 /* The remaining input chars are ignored if the callback
3050 chooses to skip the input */
3051 }
3052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
3054 if (byteorder)
3055 *byteorder = bo;
3056
Walter Dörwald69652032004-09-07 20:24:22 +00003057 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003059
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003061 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 goto onError;
3063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 Py_XDECREF(errorHandler);
3065 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return (PyObject *)unicode;
3067
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 return NULL;
3073}
3074
Antoine Pitrouab868312009-01-10 15:40:25 +00003075#undef FAST_CHAR_MASK
3076#undef SWAPPED_FAST_CHAR_MASK
3077
Tim Peters772747b2001-08-09 22:21:55 +00003078PyObject *
3079PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 Py_ssize_t size,
3081 const char *errors,
3082 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003084 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003085 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003086 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003087#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003088 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003089#else
3090 const int pairs = 0;
3091#endif
Tim Peters772747b2001-08-09 22:21:55 +00003092 /* Offsets from p for storing byte pairs in the right order. */
3093#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3094 int ihi = 1, ilo = 0;
3095#else
3096 int ihi = 0, ilo = 1;
3097#endif
3098
Benjamin Peterson29060642009-01-31 22:14:21 +00003099#define STORECHAR(CH) \
3100 do { \
3101 p[ihi] = ((CH) >> 8) & 0xff; \
3102 p[ilo] = (CH) & 0xff; \
3103 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003104 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003106#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003107 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 if (s[i] >= 0x10000)
3109 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003110#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003111 /* 2 * (size + pairs + (byteorder == 0)) */
3112 if (size > PY_SSIZE_T_MAX ||
3113 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003115 nsize = size + pairs + (byteorder == 0);
3116 bytesize = nsize * 2;
3117 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003119 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 if (v == NULL)
3121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003123 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003126 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003127 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003128
3129 if (byteorder == -1) {
3130 /* force LE */
3131 ihi = 1;
3132 ilo = 0;
3133 }
3134 else if (byteorder == 1) {
3135 /* force BE */
3136 ihi = 0;
3137 ilo = 1;
3138 }
3139
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003140 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 Py_UNICODE ch = *s++;
3142 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003143#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 if (ch >= 0x10000) {
3145 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3146 ch = 0xD800 | ((ch-0x10000) >> 10);
3147 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003148#endif
Tim Peters772747b2001-08-09 22:21:55 +00003149 STORECHAR(ch);
3150 if (ch2)
3151 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003152 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003153
3154 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003155 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003156#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157}
3158
3159PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3160{
3161 if (!PyUnicode_Check(unicode)) {
3162 PyErr_BadArgument();
3163 return NULL;
3164 }
3165 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 PyUnicode_GET_SIZE(unicode),
3167 NULL,
3168 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169}
3170
3171/* --- Unicode Escape Codec ----------------------------------------------- */
3172
/* Cached pointer to the name-lookup C API exported by the unicodedata
   module as "ucnhash_CAPI".  It stays NULL until
   PyUnicode_DecodeUnicodeEscape() meets its first \N escape and loads it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003173static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003174
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 Py_ssize_t size,
3177 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003180 Py_ssize_t startinpos;
3181 Py_ssize_t endinpos;
3182 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003187 char* message;
3188 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 PyObject *errorHandler = NULL;
3190 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003191
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 /* Escaped strings will always be longer than the resulting
3193 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 length after conversion to the true value.
3195 (but if the error callback returns a long replacement string
3196 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 v = _PyUnicode_New(size);
3198 if (v == NULL)
3199 goto onError;
3200 if (size == 0)
3201 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 while (s < end) {
3207 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003208 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210
3211 /* Non-escape characters are interpreted as Unicode ordinals */
3212 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003213 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 continue;
3215 }
3216
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 /* \ - Escapes */
3219 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003220 c = *s++;
3221 if (s > end)
3222 c = '\0'; /* Invalid after \ */
3223 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 case '\n': break;
3227 case '\\': *p++ = '\\'; break;
3228 case '\'': *p++ = '\''; break;
3229 case '\"': *p++ = '\"'; break;
3230 case 'b': *p++ = '\b'; break;
3231 case 'f': *p++ = '\014'; break; /* FF */
3232 case 't': *p++ = '\t'; break;
3233 case 'n': *p++ = '\n'; break;
3234 case 'r': *p++ = '\r'; break;
3235 case 'v': *p++ = '\013'; break; /* VT */
3236 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3237
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 case '0': case '1': case '2': case '3':
3240 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003241 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003242 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003243 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003244 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003245 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003247 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 break;
3249
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 /* hex escapes */
3251 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003253 digits = 2;
3254 message = "truncated \\xXX escape";
3255 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003259 digits = 4;
3260 message = "truncated \\uXXXX escape";
3261 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003264 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003265 digits = 8;
3266 message = "truncated \\UXXXXXXXX escape";
3267 hexescape:
3268 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 outpos = p-PyUnicode_AS_UNICODE(v);
3270 if (s+digits>end) {
3271 endinpos = size;
3272 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 errors, &errorHandler,
3274 "unicodeescape", "end of string in escape sequence",
3275 &starts, &end, &startinpos, &endinpos, &exc, &s,
3276 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 goto onError;
3278 goto nextByte;
3279 }
3280 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003281 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003282 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 endinpos = (s+i+1)-starts;
3284 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 errors, &errorHandler,
3286 "unicodeescape", message,
3287 &starts, &end, &startinpos, &endinpos, &exc, &s,
3288 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003291 }
3292 chr = (chr<<4) & ~0xF;
3293 if (c >= '0' && c <= '9')
3294 chr += c - '0';
3295 else if (c >= 'a' && c <= 'f')
3296 chr += 10 + c - 'a';
3297 else
3298 chr += 10 + c - 'A';
3299 }
3300 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003301 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 /* _decoding_error will have already written into the
3303 target buffer. */
3304 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003305 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003306 /* when we get here, chr is a 32-bit unicode character */
3307 if (chr <= 0xffff)
3308 /* UCS-2 character */
3309 *p++ = (Py_UNICODE) chr;
3310 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003311 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003312 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003313#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003314 *p++ = chr;
3315#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003316 chr -= 0x10000L;
3317 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003318 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003319#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003320 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 endinpos = s-starts;
3322 outpos = p-PyUnicode_AS_UNICODE(v);
3323 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 errors, &errorHandler,
3325 "unicodeescape", "illegal Unicode character",
3326 &starts, &end, &startinpos, &endinpos, &exc, &s,
3327 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003328 goto onError;
3329 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003330 break;
3331
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003333 case 'N':
3334 message = "malformed \\N character escape";
3335 if (ucnhash_CAPI == NULL) {
3336 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003337 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003338 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003339 if (m == NULL)
3340 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003341 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003342 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003343 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003344 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003345 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003346 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003347 if (ucnhash_CAPI == NULL)
3348 goto ucnhashError;
3349 }
3350 if (*s == '{') {
3351 const char *start = s+1;
3352 /* look for the closing brace */
3353 while (*s != '}' && s < end)
3354 s++;
3355 if (s > start && s < end && *s == '}') {
3356 /* found a name. look it up in the unicode database */
3357 message = "unknown Unicode character name";
3358 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003359 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003360 goto store;
3361 }
3362 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 endinpos = s-starts;
3364 outpos = p-PyUnicode_AS_UNICODE(v);
3365 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 errors, &errorHandler,
3367 "unicodeescape", message,
3368 &starts, &end, &startinpos, &endinpos, &exc, &s,
3369 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003370 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003371 break;
3372
3373 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003374 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 message = "\\ at end of string";
3376 s--;
3377 endinpos = s-starts;
3378 outpos = p-PyUnicode_AS_UNICODE(v);
3379 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 errors, &errorHandler,
3381 "unicodeescape", message,
3382 &starts, &end, &startinpos, &endinpos, &exc, &s,
3383 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003384 goto onError;
3385 }
3386 else {
3387 *p++ = '\\';
3388 *p++ = (unsigned char)s[-1];
3389 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003390 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003395 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003397 Py_XDECREF(errorHandler);
3398 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003400
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003402 PyErr_SetString(
3403 PyExc_UnicodeError,
3404 "\\N escapes not supported (can't load unicodedata module)"
3405 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003406 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 Py_XDECREF(errorHandler);
3408 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003409 return NULL;
3410
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 Py_XDECREF(errorHandler);
3414 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 return NULL;
3416}
3417
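/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment also used to say "If quotes is true, the
   string is enclosed in u"" or u'' quotes as appropriate", but
   PyUnicode_EncodeUnicodeEscape() takes no `quotes` argument and writes
   no quotes, so that sentence appears to be stale.

*/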
3424
Thomas Wouters477c8d52006-05-27 19:21:47 +00003425Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 Py_ssize_t size,
3427 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003428{
3429 /* like wcschr, but doesn't stop at NULL characters */
3430
3431 while (size-- > 0) {
3432 if (*s == ch)
3433 return s;
3434 s++;
3435 }
3436
3437 return NULL;
3438}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003439
/* Lowercase hexadecimal digits, indexed by a 4-bit value (0..15); used by
   the escape encoders to emit \xhh, \uxxxx and \Uxxxxxxxx sequences. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00003440static const char *hexdigits = "0123456789abcdef";
3441
3442PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003445 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003448#ifdef Py_UNICODE_WIDE
3449 const Py_ssize_t expandsize = 10;
3450#else
3451 const Py_ssize_t expandsize = 6;
3452#endif
3453
Thomas Wouters89f507f2006-12-13 04:49:30 +00003454 /* XXX(nnorwitz): rather than over-allocating, it would be
3455 better to choose a different scheme. Perhaps scan the
3456 first N-chars of the string and allocate based on that size.
3457 */
3458 /* Initial allocation is based on the longest-possible unichr
3459 escape.
3460
3461 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3462 unichr, so in this case it's the longest unichr escape. In
3463 narrow (UTF-16) builds this is five chars per source unichr
3464 since there are two unichrs in the surrogate pair, so in narrow
3465 (UTF-16) builds it's not the longest unichr escape.
3466
3467 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3468 so in the narrow (UTF-16) build case it's the longest unichr
3469 escape.
3470 */
3471
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003472 if (size == 0)
3473 return PyBytes_FromStringAndSize(NULL, 0);
3474
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003475 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003476 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003477
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003478 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 2
3480 + expandsize*size
3481 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 if (repr == NULL)
3483 return NULL;
3484
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003485 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 while (size-- > 0) {
3488 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003489
Walter Dörwald79e913e2007-05-12 11:08:06 +00003490 /* Escape backslashes */
3491 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 *p++ = '\\';
3493 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003494 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003495 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003496
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003497#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003498 /* Map 21-bit characters to '\U00xxxxxx' */
3499 else if (ch >= 0x10000) {
3500 *p++ = '\\';
3501 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003502 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3503 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3504 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3505 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3506 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3507 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3508 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3509 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003511 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003512#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3514 else if (ch >= 0xD800 && ch < 0xDC00) {
3515 Py_UNICODE ch2;
3516 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003517
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 ch2 = *s++;
3519 size--;
3520 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3521 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3522 *p++ = '\\';
3523 *p++ = 'U';
3524 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3525 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3526 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3527 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3528 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3529 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3530 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3531 *p++ = hexdigits[ucs & 0x0000000F];
3532 continue;
3533 }
3534 /* Fall through: isolated surrogates are copied as-is */
3535 s--;
3536 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003537 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003538#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003539
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003541 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 *p++ = '\\';
3543 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003544 *p++ = hexdigits[(ch >> 12) & 0x000F];
3545 *p++ = hexdigits[(ch >> 8) & 0x000F];
3546 *p++ = hexdigits[(ch >> 4) & 0x000F];
3547 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003549
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003550 /* Map special whitespace to '\t', \n', '\r' */
3551 else if (ch == '\t') {
3552 *p++ = '\\';
3553 *p++ = 't';
3554 }
3555 else if (ch == '\n') {
3556 *p++ = '\\';
3557 *p++ = 'n';
3558 }
3559 else if (ch == '\r') {
3560 *p++ = '\\';
3561 *p++ = 'r';
3562 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003563
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003564 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003565 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003567 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003568 *p++ = hexdigits[(ch >> 4) & 0x000F];
3569 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003570 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003571
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 /* Copy everything else as-is */
3573 else
3574 *p++ = (char) ch;
3575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003577 assert(p - PyBytes_AS_STRING(repr) > 0);
3578 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3579 return NULL;
3580 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581}
3582
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003583PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003585 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 if (!PyUnicode_Check(unicode)) {
3587 PyErr_BadArgument();
3588 return NULL;
3589 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003590 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3591 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003592 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593}
3594
3595/* --- Raw Unicode Escape Codec ------------------------------------------- */
3596
3597PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 Py_ssize_t size,
3599 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 Py_ssize_t startinpos;
3603 Py_ssize_t endinpos;
3604 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 const char *end;
3608 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 PyObject *errorHandler = NULL;
3610 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003611
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 /* Escaped strings will always be longer than the resulting
3613 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 length after conversion to the true value. (But decoding error
3615 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 v = _PyUnicode_New(size);
3617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 end = s + size;
3623 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 unsigned char c;
3625 Py_UCS4 x;
3626 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003627 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 /* Non-escape characters are interpreted as Unicode ordinals */
3630 if (*s != '\\') {
3631 *p++ = (unsigned char)*s++;
3632 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003633 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 startinpos = s-starts;
3635
3636 /* \u-escapes are only interpreted iff the number of leading
3637 backslashes if odd */
3638 bs = s;
3639 for (;s < end;) {
3640 if (*s != '\\')
3641 break;
3642 *p++ = (unsigned char)*s++;
3643 }
3644 if (((s - bs) & 1) == 0 ||
3645 s >= end ||
3646 (*s != 'u' && *s != 'U')) {
3647 continue;
3648 }
3649 p--;
3650 count = *s=='u' ? 4 : 8;
3651 s++;
3652
3653 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3654 outpos = p-PyUnicode_AS_UNICODE(v);
3655 for (x = 0, i = 0; i < count; ++i, ++s) {
3656 c = (unsigned char)*s;
3657 if (!ISXDIGIT(c)) {
3658 endinpos = s-starts;
3659 if (unicode_decode_call_errorhandler(
3660 errors, &errorHandler,
3661 "rawunicodeescape", "truncated \\uXXXX",
3662 &starts, &end, &startinpos, &endinpos, &exc, &s,
3663 &v, &outpos, &p))
3664 goto onError;
3665 goto nextByte;
3666 }
3667 x = (x<<4) & ~0xF;
3668 if (c >= '0' && c <= '9')
3669 x += c - '0';
3670 else if (c >= 'a' && c <= 'f')
3671 x += 10 + c - 'a';
3672 else
3673 x += 10 + c - 'A';
3674 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003675 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 /* UCS-2 character */
3677 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003678 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 /* UCS-4 character. Either store directly, or as
3680 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003681#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003682 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003683#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 x -= 0x10000L;
3685 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3686 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003687#endif
3688 } else {
3689 endinpos = s-starts;
3690 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003691 if (unicode_decode_call_errorhandler(
3692 errors, &errorHandler,
3693 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 &starts, &end, &startinpos, &endinpos, &exc, &s,
3695 &v, &outpos, &p))
3696 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 nextByte:
3699 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003701 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 Py_XDECREF(errorHandler);
3704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003706
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003709 Py_XDECREF(errorHandler);
3710 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
3714PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003717 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 char *p;
3719 char *q;
3720
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003721#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003722 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003723#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003724 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003725#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003726
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003727 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003729
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003730 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 if (repr == NULL)
3732 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003733 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003734 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003736 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 while (size-- > 0) {
3738 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003739#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 /* Map 32-bit characters to '\Uxxxxxxxx' */
3741 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003742 *p++ = '\\';
3743 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003744 *p++ = hexdigits[(ch >> 28) & 0xf];
3745 *p++ = hexdigits[(ch >> 24) & 0xf];
3746 *p++ = hexdigits[(ch >> 20) & 0xf];
3747 *p++ = hexdigits[(ch >> 16) & 0xf];
3748 *p++ = hexdigits[(ch >> 12) & 0xf];
3749 *p++ = hexdigits[(ch >> 8) & 0xf];
3750 *p++ = hexdigits[(ch >> 4) & 0xf];
3751 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003752 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003753 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003754#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3756 if (ch >= 0xD800 && ch < 0xDC00) {
3757 Py_UNICODE ch2;
3758 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003759
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 ch2 = *s++;
3761 size--;
3762 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3763 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3764 *p++ = '\\';
3765 *p++ = 'U';
3766 *p++ = hexdigits[(ucs >> 28) & 0xf];
3767 *p++ = hexdigits[(ucs >> 24) & 0xf];
3768 *p++ = hexdigits[(ucs >> 20) & 0xf];
3769 *p++ = hexdigits[(ucs >> 16) & 0xf];
3770 *p++ = hexdigits[(ucs >> 12) & 0xf];
3771 *p++ = hexdigits[(ucs >> 8) & 0xf];
3772 *p++ = hexdigits[(ucs >> 4) & 0xf];
3773 *p++ = hexdigits[ucs & 0xf];
3774 continue;
3775 }
3776 /* Fall through: isolated surrogates are copied as-is */
3777 s--;
3778 size++;
3779 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003780#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 /* Map 16-bit characters to '\uxxxx' */
3782 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 *p++ = '\\';
3784 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003785 *p++ = hexdigits[(ch >> 12) & 0xf];
3786 *p++ = hexdigits[(ch >> 8) & 0xf];
3787 *p++ = hexdigits[(ch >> 4) & 0xf];
3788 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* Copy everything else as-is */
3791 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 *p++ = (char) ch;
3793 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003794 size = p - q;
3795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003796 assert(size > 0);
3797 if (_PyBytes_Resize(&repr, size) < 0)
3798 return NULL;
3799 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800}
3801
3802PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3803{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003804 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003806 PyErr_BadArgument();
3807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003809 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3810 PyUnicode_GET_SIZE(unicode));
3811
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003812 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813}
3814
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003815/* --- Unicode Internal Codec ------------------------------------------- */
3816
3817PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 Py_ssize_t size,
3819 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003820{
3821 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 Py_ssize_t startinpos;
3823 Py_ssize_t endinpos;
3824 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003825 PyUnicodeObject *v;
3826 Py_UNICODE *p;
3827 const char *end;
3828 const char *reason;
3829 PyObject *errorHandler = NULL;
3830 PyObject *exc = NULL;
3831
Neal Norwitzd43069c2006-01-08 01:12:10 +00003832#ifdef Py_UNICODE_WIDE
3833 Py_UNICODE unimax = PyUnicode_GetMax();
3834#endif
3835
Thomas Wouters89f507f2006-12-13 04:49:30 +00003836 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003837 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3838 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003840 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003842 p = PyUnicode_AS_UNICODE(v);
3843 end = s + size;
3844
3845 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003846 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003847 /* We have to sanity check the raw data, otherwise doom looms for
3848 some malformed UCS-4 data. */
3849 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003850#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003851 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003852#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003853 end-s < Py_UNICODE_SIZE
3854 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003856 startinpos = s - starts;
3857 if (end-s < Py_UNICODE_SIZE) {
3858 endinpos = end-starts;
3859 reason = "truncated input";
3860 }
3861 else {
3862 endinpos = s - starts + Py_UNICODE_SIZE;
3863 reason = "illegal code point (> 0x10FFFF)";
3864 }
3865 outpos = p - PyUnicode_AS_UNICODE(v);
3866 if (unicode_decode_call_errorhandler(
3867 errors, &errorHandler,
3868 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003869 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003870 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003871 goto onError;
3872 }
3873 }
3874 else {
3875 p++;
3876 s += Py_UNICODE_SIZE;
3877 }
3878 }
3879
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003880 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003881 goto onError;
3882 Py_XDECREF(errorHandler);
3883 Py_XDECREF(exc);
3884 return (PyObject *)v;
3885
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003887 Py_XDECREF(v);
3888 Py_XDECREF(errorHandler);
3889 Py_XDECREF(exc);
3890 return NULL;
3891}
3892
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893/* --- Latin-1 Codec ------------------------------------------------------ */
3894
3895PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003896 Py_ssize_t size,
3897 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898{
3899 PyUnicodeObject *v;
3900 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003901 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003904 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003905 Py_UNICODE r = *(unsigned char*)s;
3906 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003907 }
3908
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 v = _PyUnicode_New(size);
3910 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003915 e = s + size;
3916 /* Unrolling the copy makes it much faster by reducing the looping
3917 overhead. This is similar to what many memcpy() implementations do. */
3918 unrolled_end = e - 4;
3919 while (s < unrolled_end) {
3920 p[0] = (unsigned char) s[0];
3921 p[1] = (unsigned char) s[1];
3922 p[2] = (unsigned char) s[2];
3923 p[3] = (unsigned char) s[3];
3924 s += 4;
3925 p += 4;
3926 }
3927 while (s < e)
3928 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003930
Benjamin Peterson29060642009-01-31 22:14:21 +00003931 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 Py_XDECREF(v);
3933 return NULL;
3934}
3935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936/* create or adjust a UnicodeEncodeError */
3937static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 const char *encoding,
3939 const Py_UNICODE *unicode, Py_ssize_t size,
3940 Py_ssize_t startpos, Py_ssize_t endpos,
3941 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 *exceptionObject = PyUnicodeEncodeError_Create(
3945 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 }
3947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3949 goto onError;
3950 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3951 goto onError;
3952 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3953 goto onError;
3954 return;
3955 onError:
3956 Py_DECREF(*exceptionObject);
3957 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 }
3959}
3960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961/* raises a UnicodeEncodeError */
3962static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 const char *encoding,
3964 const Py_UNICODE *unicode, Py_ssize_t size,
3965 Py_ssize_t startpos, Py_ssize_t endpos,
3966 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967{
3968 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972}
3973
3974/* error handling callback helper:
3975 build arguments, call the callback and check the arguments,
3976 put the result into newpos and return the replacement string, which
3977 has to be freed by the caller */
3978static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 PyObject **errorHandler,
3980 const char *encoding, const char *reason,
3981 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3982 Py_ssize_t startpos, Py_ssize_t endpos,
3983 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003985 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986
3987 PyObject *restuple;
3988 PyObject *resunicode;
3989
3990 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 }
3995
3996 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000
4001 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004006 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 Py_DECREF(restuple);
4008 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004010 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 &resunicode, newpos)) {
4012 Py_DECREF(restuple);
4013 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004015 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4016 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4017 Py_DECREF(restuple);
4018 return NULL;
4019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004022 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4024 Py_DECREF(restuple);
4025 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_INCREF(resunicode);
4028 Py_DECREF(restuple);
4029 return resunicode;
4030}
4031
4032static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004033 Py_ssize_t size,
4034 const char *errors,
4035 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036{
4037 /* output object */
4038 PyObject *res;
4039 /* pointers to the beginning and end+1 of input */
4040 const Py_UNICODE *startp = p;
4041 const Py_UNICODE *endp = p + size;
4042 /* pointer to the beginning of the unencodable characters */
4043 /* const Py_UNICODE *badp = NULL; */
4044 /* pointer into the output */
4045 char *str;
4046 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004047 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004048 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4049 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 PyObject *errorHandler = NULL;
4051 PyObject *exc = NULL;
4052 /* the following variable is used for caching string comparisons
4053 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4054 int known_errorHandler = -1;
4055
4056 /* allocate enough for a simple encoding without
4057 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004058 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004059 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004060 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004062 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 ressize = size;
4065
4066 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 /* can we encode this? */
4070 if (c<limit) {
4071 /* no overflow check, because we know that the space is enough */
4072 *str++ = (char)c;
4073 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 else {
4076 Py_ssize_t unicodepos = p-startp;
4077 Py_ssize_t requiredsize;
4078 PyObject *repunicode;
4079 Py_ssize_t repsize;
4080 Py_ssize_t newpos;
4081 Py_ssize_t respos;
4082 Py_UNICODE *uni2;
4083 /* startpos for collecting unencodable chars */
4084 const Py_UNICODE *collstart = p;
4085 const Py_UNICODE *collend = p;
4086 /* find all unecodable characters */
4087 while ((collend < endp) && ((*collend)>=limit))
4088 ++collend;
4089 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4090 if (known_errorHandler==-1) {
4091 if ((errors==NULL) || (!strcmp(errors, "strict")))
4092 known_errorHandler = 1;
4093 else if (!strcmp(errors, "replace"))
4094 known_errorHandler = 2;
4095 else if (!strcmp(errors, "ignore"))
4096 known_errorHandler = 3;
4097 else if (!strcmp(errors, "xmlcharrefreplace"))
4098 known_errorHandler = 4;
4099 else
4100 known_errorHandler = 0;
4101 }
4102 switch (known_errorHandler) {
4103 case 1: /* strict */
4104 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4105 goto onError;
4106 case 2: /* replace */
4107 while (collstart++<collend)
4108 *str++ = '?'; /* fall through */
4109 case 3: /* ignore */
4110 p = collend;
4111 break;
4112 case 4: /* xmlcharrefreplace */
4113 respos = str - PyBytes_AS_STRING(res);
4114 /* determine replacement size (temporarily (mis)uses p) */
4115 for (p = collstart, repsize = 0; p < collend; ++p) {
4116 if (*p<10)
4117 repsize += 2+1+1;
4118 else if (*p<100)
4119 repsize += 2+2+1;
4120 else if (*p<1000)
4121 repsize += 2+3+1;
4122 else if (*p<10000)
4123 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004124#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 else
4126 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004127#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 else if (*p<100000)
4129 repsize += 2+5+1;
4130 else if (*p<1000000)
4131 repsize += 2+6+1;
4132 else
4133 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004134#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 }
4136 requiredsize = respos+repsize+(endp-collend);
4137 if (requiredsize > ressize) {
4138 if (requiredsize<2*ressize)
4139 requiredsize = 2*ressize;
4140 if (_PyBytes_Resize(&res, requiredsize))
4141 goto onError;
4142 str = PyBytes_AS_STRING(res) + respos;
4143 ressize = requiredsize;
4144 }
4145 /* generate replacement (temporarily (mis)uses p) */
4146 for (p = collstart; p < collend; ++p) {
4147 str += sprintf(str, "&#%d;", (int)*p);
4148 }
4149 p = collend;
4150 break;
4151 default:
4152 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4153 encoding, reason, startp, size, &exc,
4154 collstart-startp, collend-startp, &newpos);
4155 if (repunicode == NULL)
4156 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004157 if (!PyUnicode_Check(repunicode)) {
4158 /* Implementation limitation: byte results not supported yet. */
4159 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
4160 Py_DECREF(repunicode);
4161 goto onError;
4162 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 /* need more space? (at least enough for what we
4164 have+the replacement+the rest of the string, so
4165 we won't have to check space for encodable characters) */
4166 respos = str - PyBytes_AS_STRING(res);
4167 repsize = PyUnicode_GET_SIZE(repunicode);
4168 requiredsize = respos+repsize+(endp-collend);
4169 if (requiredsize > ressize) {
4170 if (requiredsize<2*ressize)
4171 requiredsize = 2*ressize;
4172 if (_PyBytes_Resize(&res, requiredsize)) {
4173 Py_DECREF(repunicode);
4174 goto onError;
4175 }
4176 str = PyBytes_AS_STRING(res) + respos;
4177 ressize = requiredsize;
4178 }
4179 /* check if there is anything unencodable in the replacement
4180 and copy it to the output */
4181 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4182 c = *uni2;
4183 if (c >= limit) {
4184 raise_encode_exception(&exc, encoding, startp, size,
4185 unicodepos, unicodepos+1, reason);
4186 Py_DECREF(repunicode);
4187 goto onError;
4188 }
4189 *str = (char)c;
4190 }
4191 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004192 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004194 }
4195 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004196 /* Resize if we allocated to much */
4197 size = str - PyBytes_AS_STRING(res);
4198 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004199 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004200 if (_PyBytes_Resize(&res, size) < 0)
4201 goto onError;
4202 }
4203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(errorHandler);
4205 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004206 return res;
4207
4208 onError:
4209 Py_XDECREF(res);
4210 Py_XDECREF(errorHandler);
4211 Py_XDECREF(exc);
4212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213}
4214
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 Py_ssize_t size,
4217 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220}
4221
4222PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4223{
4224 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 PyErr_BadArgument();
4226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 }
4228 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 PyUnicode_GET_SIZE(unicode),
4230 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231}
4232
4233/* --- 7-bit ASCII Codec -------------------------------------------------- */
4234
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 Py_ssize_t size,
4237 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240 PyUnicodeObject *v;
4241 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004242 Py_ssize_t startinpos;
4243 Py_ssize_t endinpos;
4244 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 const char *e;
4246 PyObject *errorHandler = NULL;
4247 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004248
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004250 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 Py_UNICODE r = *(unsigned char*)s;
4252 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004253 }
Tim Petersced69f82003-09-16 20:30:58 +00004254
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 v = _PyUnicode_New(size);
4256 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 e = s + size;
4262 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 register unsigned char c = (unsigned char)*s;
4264 if (c < 128) {
4265 *p++ = c;
4266 ++s;
4267 }
4268 else {
4269 startinpos = s-starts;
4270 endinpos = startinpos + 1;
4271 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4272 if (unicode_decode_call_errorhandler(
4273 errors, &errorHandler,
4274 "ascii", "ordinal not in range(128)",
4275 &starts, &e, &startinpos, &endinpos, &exc, &s,
4276 &v, &outpos, &p))
4277 goto onError;
4278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004280 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 Py_XDECREF(errorHandler);
4284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004286
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 Py_XDECREF(errorHandler);
4290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return NULL;
4292}
4293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 Py_ssize_t size,
4296 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299}
4300
4301PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4302{
4303 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 PyErr_BadArgument();
4305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 }
4307 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 PyUnicode_GET_SIZE(unicode),
4309 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310}
4311
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004312#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004313
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004314/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004315
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004316#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004317#define NEED_RETRY
4318#endif
4319
4320/* XXX This code is limited to "true" double-byte encodings, as
4321 a) it assumes an incomplete character consists of a single byte, and
4322 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004324
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, 0
   otherwise.  A byte that IsDBCSLeadByte() accepts is only counted when
   CharPrev() did not move back from it, or the preceding character does not
   begin with a lead byte, or that preceding character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4335
4336/*
4337 * Decode MBCS string into unicode object. If 'final' is set, converts
4338 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4339 */
4340static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 const char *s, /* MBCS string */
4342 int size, /* sizeof MBCS string */
4343 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004344{
4345 Py_UNICODE *p;
4346 Py_ssize_t n = 0;
4347 int usize = 0;
4348
4349 assert(size >= 0);
4350
4351 /* Skip trailing lead-byte unless 'final' is set */
4352 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004354
4355 /* First get the size of the result */
4356 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4358 if (usize == 0) {
4359 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4360 return -1;
4361 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004362 }
4363
4364 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 /* Create unicode object */
4366 *v = _PyUnicode_New(usize);
4367 if (*v == NULL)
4368 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004369 }
4370 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 /* Extend unicode object */
4372 n = PyUnicode_GET_SIZE(*v);
4373 if (_PyUnicode_Resize(v, n + usize) < 0)
4374 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004375 }
4376
4377 /* Do the conversion */
4378 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 p = PyUnicode_AS_UNICODE(*v) + n;
4380 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4381 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4382 return -1;
4383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004384 }
4385
4386 return size;
4387}
4388
4389PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 Py_ssize_t size,
4391 const char *errors,
4392 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004393{
4394 PyUnicodeObject *v = NULL;
4395 int done;
4396
4397 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004399
4400#ifdef NEED_RETRY
4401 retry:
4402 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004404 else
4405#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004407
4408 if (done < 0) {
4409 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004411 }
4412
4413 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004415
4416#ifdef NEED_RETRY
4417 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 s += done;
4419 size -= done;
4420 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004421 }
4422#endif
4423
4424 return (PyObject *)v;
4425}
4426
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004427PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 Py_ssize_t size,
4429 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004430{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004431 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4432}
4433
4434/*
4435 * Convert unicode into string object (MBCS).
4436 * Returns 0 if succeed, -1 otherwise.
4437 */
4438static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 const Py_UNICODE *p, /* unicode */
4440 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004441{
4442 int mbcssize = 0;
4443 Py_ssize_t n = 0;
4444
4445 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004446
4447 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004448 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4450 if (mbcssize == 0) {
4451 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4452 return -1;
4453 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004454 }
4455
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004456 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 /* Create string object */
4458 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4459 if (*repr == NULL)
4460 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004461 }
4462 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 /* Extend string object */
4464 n = PyBytes_Size(*repr);
4465 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4466 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004467 }
4468
4469 /* Do the conversion */
4470 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 char *s = PyBytes_AS_STRING(*repr) + n;
4472 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4473 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4474 return -1;
4475 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004476 }
4477
4478 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004479}
4480
4481PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 Py_ssize_t size,
4483 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004484{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004485 PyObject *repr = NULL;
4486 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004487
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004488#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004490 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004492 else
4493#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004495
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004496 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 Py_XDECREF(repr);
4498 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004499 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004500
4501#ifdef NEED_RETRY
4502 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 p += INT_MAX;
4504 size -= INT_MAX;
4505 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004506 }
4507#endif
4508
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004509 return repr;
4510}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004511
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004512PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4513{
4514 if (!PyUnicode_Check(unicode)) {
4515 PyErr_BadArgument();
4516 return NULL;
4517 }
4518 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 PyUnicode_GET_SIZE(unicode),
4520 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004521}
4522
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004523#undef NEED_RETRY
4524
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004525#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004526
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527/* --- Character Mapping Codec -------------------------------------------- */
4528
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 Py_ssize_t size,
4531 PyObject *mapping,
4532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 Py_ssize_t startinpos;
4536 Py_ssize_t endinpos;
4537 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 PyUnicodeObject *v;
4540 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 PyObject *errorHandler = NULL;
4543 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004544 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004545 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004546
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 /* Default to Latin-1 */
4548 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
4551 v = _PyUnicode_New(size);
4552 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004558 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 mapstring = PyUnicode_AS_UNICODE(mapping);
4560 maplen = PyUnicode_GET_SIZE(mapping);
4561 while (s < e) {
4562 unsigned char ch = *s;
4563 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 if (ch < maplen)
4566 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 if (x == 0xfffe) {
4569 /* undefined mapping */
4570 outpos = p-PyUnicode_AS_UNICODE(v);
4571 startinpos = s-starts;
4572 endinpos = startinpos+1;
4573 if (unicode_decode_call_errorhandler(
4574 errors, &errorHandler,
4575 "charmap", "character maps to <undefined>",
4576 &starts, &e, &startinpos, &endinpos, &exc, &s,
4577 &v, &outpos, &p)) {
4578 goto onError;
4579 }
4580 continue;
4581 }
4582 *p++ = x;
4583 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004584 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004585 }
4586 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 while (s < e) {
4588 unsigned char ch = *s;
4589 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004590
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4592 w = PyLong_FromLong((long)ch);
4593 if (w == NULL)
4594 goto onError;
4595 x = PyObject_GetItem(mapping, w);
4596 Py_DECREF(w);
4597 if (x == NULL) {
4598 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4599 /* No mapping found means: mapping is undefined. */
4600 PyErr_Clear();
4601 x = Py_None;
4602 Py_INCREF(x);
4603 } else
4604 goto onError;
4605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004606
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 /* Apply mapping */
4608 if (PyLong_Check(x)) {
4609 long value = PyLong_AS_LONG(x);
4610 if (value < 0 || value > 65535) {
4611 PyErr_SetString(PyExc_TypeError,
4612 "character mapping must be in range(65536)");
4613 Py_DECREF(x);
4614 goto onError;
4615 }
4616 *p++ = (Py_UNICODE)value;
4617 }
4618 else if (x == Py_None) {
4619 /* undefined mapping */
4620 outpos = p-PyUnicode_AS_UNICODE(v);
4621 startinpos = s-starts;
4622 endinpos = startinpos+1;
4623 if (unicode_decode_call_errorhandler(
4624 errors, &errorHandler,
4625 "charmap", "character maps to <undefined>",
4626 &starts, &e, &startinpos, &endinpos, &exc, &s,
4627 &v, &outpos, &p)) {
4628 Py_DECREF(x);
4629 goto onError;
4630 }
4631 Py_DECREF(x);
4632 continue;
4633 }
4634 else if (PyUnicode_Check(x)) {
4635 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004636
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 if (targetsize == 1)
4638 /* 1-1 mapping */
4639 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004640
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 else if (targetsize > 1) {
4642 /* 1-n mapping */
4643 if (targetsize > extrachars) {
4644 /* resize first */
4645 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4646 Py_ssize_t needed = (targetsize - extrachars) + \
4647 (targetsize << 2);
4648 extrachars += needed;
4649 /* XXX overflow detection missing */
4650 if (_PyUnicode_Resize(&v,
4651 PyUnicode_GET_SIZE(v) + needed) < 0) {
4652 Py_DECREF(x);
4653 goto onError;
4654 }
4655 p = PyUnicode_AS_UNICODE(v) + oldpos;
4656 }
4657 Py_UNICODE_COPY(p,
4658 PyUnicode_AS_UNICODE(x),
4659 targetsize);
4660 p += targetsize;
4661 extrachars -= targetsize;
4662 }
4663 /* 1-0 mapping: skip the character */
4664 }
4665 else {
4666 /* wrong return value */
4667 PyErr_SetString(PyExc_TypeError,
4668 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004669 Py_DECREF(x);
4670 goto onError;
4671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 Py_DECREF(x);
4673 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 }
4676 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 Py_XDECREF(errorHandler);
4680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004682
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 Py_XDECREF(errorHandler);
4685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 Py_XDECREF(v);
4687 return NULL;
4688}
4689
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004690/* Charmap encoding: the lookup table */
4691
4692struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 PyObject_HEAD
4694 unsigned char level1[32];
4695 int count2, count3;
4696 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697};
4698
4699static PyObject*
4700encoding_map_size(PyObject *obj, PyObject* args)
4701{
4702 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004703 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004705}
4706
4707static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004708 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 PyDoc_STR("Return the size (in bytes) of this object") },
4710 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004711};
4712
4713static void
4714encoding_map_dealloc(PyObject* o)
4715{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004716 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004717}
4718
4719static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004720 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 "EncodingMap", /*tp_name*/
4722 sizeof(struct encoding_map), /*tp_basicsize*/
4723 0, /*tp_itemsize*/
4724 /* methods */
4725 encoding_map_dealloc, /*tp_dealloc*/
4726 0, /*tp_print*/
4727 0, /*tp_getattr*/
4728 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004729 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 0, /*tp_repr*/
4731 0, /*tp_as_number*/
4732 0, /*tp_as_sequence*/
4733 0, /*tp_as_mapping*/
4734 0, /*tp_hash*/
4735 0, /*tp_call*/
4736 0, /*tp_str*/
4737 0, /*tp_getattro*/
4738 0, /*tp_setattro*/
4739 0, /*tp_as_buffer*/
4740 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4741 0, /*tp_doc*/
4742 0, /*tp_traverse*/
4743 0, /*tp_clear*/
4744 0, /*tp_richcompare*/
4745 0, /*tp_weaklistoffset*/
4746 0, /*tp_iter*/
4747 0, /*tp_iternext*/
4748 encoding_map_methods, /*tp_methods*/
4749 0, /*tp_members*/
4750 0, /*tp_getset*/
4751 0, /*tp_base*/
4752 0, /*tp_dict*/
4753 0, /*tp_descr_get*/
4754 0, /*tp_descr_set*/
4755 0, /*tp_dictoffset*/
4756 0, /*tp_init*/
4757 0, /*tp_alloc*/
4758 0, /*tp_new*/
4759 0, /*tp_free*/
4760 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004761};
4762
4763PyObject*
4764PyUnicode_BuildEncodingMap(PyObject* string)
4765{
4766 Py_UNICODE *decode;
4767 PyObject *result;
4768 struct encoding_map *mresult;
4769 int i;
4770 int need_dict = 0;
4771 unsigned char level1[32];
4772 unsigned char level2[512];
4773 unsigned char *mlevel1, *mlevel2, *mlevel3;
4774 int count2 = 0, count3 = 0;
4775
4776 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4777 PyErr_BadArgument();
4778 return NULL;
4779 }
4780 decode = PyUnicode_AS_UNICODE(string);
4781 memset(level1, 0xFF, sizeof level1);
4782 memset(level2, 0xFF, sizeof level2);
4783
4784 /* If there isn't a one-to-one mapping of NULL to \0,
4785 or if there are non-BMP characters, we need to use
4786 a mapping dictionary. */
4787 if (decode[0] != 0)
4788 need_dict = 1;
4789 for (i = 1; i < 256; i++) {
4790 int l1, l2;
4791 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004792#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004793 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004794#endif
4795 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004796 need_dict = 1;
4797 break;
4798 }
4799 if (decode[i] == 0xFFFE)
4800 /* unmapped character */
4801 continue;
4802 l1 = decode[i] >> 11;
4803 l2 = decode[i] >> 7;
4804 if (level1[l1] == 0xFF)
4805 level1[l1] = count2++;
4806 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004808 }
4809
4810 if (count2 >= 0xFF || count3 >= 0xFF)
4811 need_dict = 1;
4812
4813 if (need_dict) {
4814 PyObject *result = PyDict_New();
4815 PyObject *key, *value;
4816 if (!result)
4817 return NULL;
4818 for (i = 0; i < 256; i++) {
4819 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004820 key = PyLong_FromLong(decode[i]);
4821 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004822 if (!key || !value)
4823 goto failed1;
4824 if (PyDict_SetItem(result, key, value) == -1)
4825 goto failed1;
4826 Py_DECREF(key);
4827 Py_DECREF(value);
4828 }
4829 return result;
4830 failed1:
4831 Py_XDECREF(key);
4832 Py_XDECREF(value);
4833 Py_DECREF(result);
4834 return NULL;
4835 }
4836
4837 /* Create a three-level trie */
4838 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4839 16*count2 + 128*count3 - 1);
4840 if (!result)
4841 return PyErr_NoMemory();
4842 PyObject_Init(result, &EncodingMapType);
4843 mresult = (struct encoding_map*)result;
4844 mresult->count2 = count2;
4845 mresult->count3 = count3;
4846 mlevel1 = mresult->level1;
4847 mlevel2 = mresult->level23;
4848 mlevel3 = mresult->level23 + 16*count2;
4849 memcpy(mlevel1, level1, 32);
4850 memset(mlevel2, 0xFF, 16*count2);
4851 memset(mlevel3, 0, 128*count3);
4852 count3 = 0;
4853 for (i = 1; i < 256; i++) {
4854 int o1, o2, o3, i2, i3;
4855 if (decode[i] == 0xFFFE)
4856 /* unmapped character */
4857 continue;
4858 o1 = decode[i]>>11;
4859 o2 = (decode[i]>>7) & 0xF;
4860 i2 = 16*mlevel1[o1] + o2;
4861 if (mlevel2[i2] == 0xFF)
4862 mlevel2[i2] = count3++;
4863 o3 = decode[i] & 0x7F;
4864 i3 = 128*mlevel2[i2] + o3;
4865 mlevel3[i3] = i;
4866 }
4867 return result;
4868}
4869
4870static int
4871encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4872{
4873 struct encoding_map *map = (struct encoding_map*)mapping;
4874 int l1 = c>>11;
4875 int l2 = (c>>7) & 0xF;
4876 int l3 = c & 0x7F;
4877 int i;
4878
4879#ifdef Py_UNICODE_WIDE
4880 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004882 }
4883#endif
4884 if (c == 0)
4885 return 0;
4886 /* level 1*/
4887 i = map->level1[l1];
4888 if (i == 0xFF) {
4889 return -1;
4890 }
4891 /* level 2*/
4892 i = map->level23[16*i+l2];
4893 if (i == 0xFF) {
4894 return -1;
4895 }
4896 /* level 3 */
4897 i = map->level23[16*map->count2 + 128*i + l3];
4898 if (i == 0) {
4899 return -1;
4900 }
4901 return i;
4902}
4903
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904/* Lookup the character ch in the mapping. If the character
4905 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004906 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
Christian Heimes217cfd12007-12-02 14:31:20 +00004909 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 PyObject *x;
4911
4912 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 x = PyObject_GetItem(mapping, w);
4915 Py_DECREF(w);
4916 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4918 /* No mapping found means: mapping is undefined. */
4919 PyErr_Clear();
4920 x = Py_None;
4921 Py_INCREF(x);
4922 return x;
4923 } else
4924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004926 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004928 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 long value = PyLong_AS_LONG(x);
4930 if (value < 0 || value > 255) {
4931 PyErr_SetString(PyExc_TypeError,
4932 "character mapping must be in range(256)");
4933 Py_DECREF(x);
4934 return NULL;
4935 }
4936 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004938 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 /* wrong return value */
4942 PyErr_Format(PyExc_TypeError,
4943 "character mapping must return integer, bytes or None, not %.400s",
4944 x->ob_type->tp_name);
4945 Py_DECREF(x);
4946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 }
4948}
4949
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004950static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004951charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004952{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004953 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4954 /* exponentially overallocate to minimize reallocations */
4955 if (requiredsize < 2*outsize)
4956 requiredsize = 2*outsize;
4957 if (_PyBytes_Resize(outobj, requiredsize))
4958 return -1;
4959 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004960}
4961
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004966 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 space is available. Return a new reference to the object that
4968 was put in the output buffer, or Py_None, if the mapping was undefined
4969 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004970 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004972charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004974{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004975 PyObject *rep;
4976 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004977 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978
Christian Heimes90aa7642007-12-19 02:45:37 +00004979 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004980 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004982 if (res == -1)
4983 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 if (outsize<requiredsize)
4985 if (charmapencode_resize(outobj, outpos, requiredsize))
4986 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004987 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 outstart[(*outpos)++] = (char)res;
4989 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004990 }
4991
4992 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004995 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 Py_DECREF(rep);
4997 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004998 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 if (PyLong_Check(rep)) {
5000 Py_ssize_t requiredsize = *outpos+1;
5001 if (outsize<requiredsize)
5002 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5003 Py_DECREF(rep);
5004 return enc_EXCEPTION;
5005 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005006 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005008 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 else {
5010 const char *repchars = PyBytes_AS_STRING(rep);
5011 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5012 Py_ssize_t requiredsize = *outpos+repsize;
5013 if (outsize<requiredsize)
5014 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5015 Py_DECREF(rep);
5016 return enc_EXCEPTION;
5017 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005018 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 memcpy(outstart + *outpos, repchars, repsize);
5020 *outpos += repsize;
5021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005023 Py_DECREF(rep);
5024 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025}
5026
5027/* handle an error in PyUnicode_EncodeCharmap
5028 Return 0 on success, -1 on error */
5029static
5030int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005031 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005033 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005034 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035{
5036 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005037 Py_ssize_t repsize;
5038 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039 Py_UNICODE *uni2;
5040 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005041 Py_ssize_t collstartpos = *inpos;
5042 Py_ssize_t collendpos = *inpos+1;
5043 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 char *encoding = "charmap";
5045 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005046 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 /* find all unencodable characters */
5049 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005050 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005051 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 int res = encoding_map_lookup(p[collendpos], mapping);
5053 if (res != -1)
5054 break;
5055 ++collendpos;
5056 continue;
5057 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005058
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 rep = charmapencode_lookup(p[collendpos], mapping);
5060 if (rep==NULL)
5061 return -1;
5062 else if (rep!=Py_None) {
5063 Py_DECREF(rep);
5064 break;
5065 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005066 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 }
5069 /* cache callback name lookup
5070 * (if not done yet, i.e. it's the first error) */
5071 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 if ((errors==NULL) || (!strcmp(errors, "strict")))
5073 *known_errorHandler = 1;
5074 else if (!strcmp(errors, "replace"))
5075 *known_errorHandler = 2;
5076 else if (!strcmp(errors, "ignore"))
5077 *known_errorHandler = 3;
5078 else if (!strcmp(errors, "xmlcharrefreplace"))
5079 *known_errorHandler = 4;
5080 else
5081 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 }
5083 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005084 case 1: /* strict */
5085 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5086 return -1;
5087 case 2: /* replace */
5088 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 x = charmapencode_output('?', mapping, res, respos);
5090 if (x==enc_EXCEPTION) {
5091 return -1;
5092 }
5093 else if (x==enc_FAILED) {
5094 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5095 return -1;
5096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005097 }
5098 /* fall through */
5099 case 3: /* ignore */
5100 *inpos = collendpos;
5101 break;
5102 case 4: /* xmlcharrefreplace */
5103 /* generate replacement (temporarily (mis)uses p) */
5104 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 char buffer[2+29+1+1];
5106 char *cp;
5107 sprintf(buffer, "&#%d;", (int)p[collpos]);
5108 for (cp = buffer; *cp; ++cp) {
5109 x = charmapencode_output(*cp, mapping, res, respos);
5110 if (x==enc_EXCEPTION)
5111 return -1;
5112 else if (x==enc_FAILED) {
5113 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5114 return -1;
5115 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005116 }
5117 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005118 *inpos = collendpos;
5119 break;
5120 default:
5121 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 encoding, reason, p, size, exceptionObject,
5123 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005124 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 return -1;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005126 if (!PyUnicode_Check(repunicode)) {
5127 /* Implementation limitation: byte results not supported yet. */
5128 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5129 Py_DECREF(repunicode);
5130 return -1;
5131 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005132 /* generate replacement */
5133 repsize = PyUnicode_GET_SIZE(repunicode);
5134 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 x = charmapencode_output(*uni2, mapping, res, respos);
5136 if (x==enc_EXCEPTION) {
5137 return -1;
5138 }
5139 else if (x==enc_FAILED) {
5140 Py_DECREF(repunicode);
5141 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5142 return -1;
5143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005144 }
5145 *inpos = newpos;
5146 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 }
5148 return 0;
5149}
5150
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 Py_ssize_t size,
5153 PyObject *mapping,
5154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 /* output object */
5157 PyObject *res = NULL;
5158 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005159 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 PyObject *errorHandler = NULL;
5163 PyObject *exc = NULL;
5164 /* the following variable is used for caching string comparisons
5165 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5166 * 3=ignore, 4=xmlcharrefreplace */
5167 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168
5169 /* Default to Latin-1 */
5170 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005173 /* allocate enough for a simple encoding without
5174 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005175 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005176 if (res == NULL)
5177 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005178 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 /* try to encode it */
5183 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5184 if (x==enc_EXCEPTION) /* error */
5185 goto onError;
5186 if (x==enc_FAILED) { /* unencodable character */
5187 if (charmap_encoding_error(p, size, &inpos, mapping,
5188 &exc,
5189 &known_errorHandler, &errorHandler, errors,
5190 &res, &respos)) {
5191 goto onError;
5192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 else
5195 /* done with this character => adjust input position */
5196 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005200 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005201 if (_PyBytes_Resize(&res, respos) < 0)
5202 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 Py_XDECREF(exc);
5205 Py_XDECREF(errorHandler);
5206 return res;
5207
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 Py_XDECREF(res);
5210 Py_XDECREF(exc);
5211 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 return NULL;
5213}
5214
5215PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
5218 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 PyErr_BadArgument();
5220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 }
5222 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 PyUnicode_GET_SIZE(unicode),
5224 mapping,
5225 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226}
5227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228/* create or adjust a UnicodeTranslateError */
5229static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 const Py_UNICODE *unicode, Py_ssize_t size,
5231 Py_ssize_t startpos, Py_ssize_t endpos,
5232 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005235 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 }
5238 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5240 goto onError;
5241 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5242 goto onError;
5243 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5244 goto onError;
5245 return;
5246 onError:
5247 Py_DECREF(*exceptionObject);
5248 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 }
5250}
5251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252/* raises a UnicodeTranslateError */
5253static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 const Py_UNICODE *unicode, Py_ssize_t size,
5255 Py_ssize_t startpos, Py_ssize_t endpos,
5256 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005257{
5258 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005259 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005260 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262}
5263
5264/* error handling callback helper:
5265 build arguments, call the callback and check the arguments,
5266 put the result into newpos and return the replacement string, which
5267 has to be freed by the caller */
5268static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 PyObject **errorHandler,
5270 const char *reason,
5271 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5272 Py_ssize_t startpos, Py_ssize_t endpos,
5273 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005275 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005276
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005277 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005278 PyObject *restuple;
5279 PyObject *resunicode;
5280
5281 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 }
5286
5287 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005291
5292 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005297 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 Py_DECREF(restuple);
5299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300 }
5301 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 &resunicode, &i_newpos)) {
5303 Py_DECREF(restuple);
5304 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 else
5309 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005310 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5312 Py_DECREF(restuple);
5313 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005314 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005315 Py_INCREF(resunicode);
5316 Py_DECREF(restuple);
5317 return resunicode;
5318}
5319
5320/* Lookup the character ch in the mapping and put the result in result,
5321 which must be decrefed by the caller.
5322 Return 0 on success, -1 on error */
5323static
5324int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5325{
Christian Heimes217cfd12007-12-02 14:31:20 +00005326 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327 PyObject *x;
5328
5329 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331 x = PyObject_GetItem(mapping, w);
5332 Py_DECREF(w);
5333 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5335 /* No mapping found means: use 1:1 mapping. */
5336 PyErr_Clear();
5337 *result = NULL;
5338 return 0;
5339 } else
5340 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 }
5342 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 *result = x;
5344 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005346 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 long value = PyLong_AS_LONG(x);
5348 long max = PyUnicode_GetMax();
5349 if (value < 0 || value > max) {
5350 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005351 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 Py_DECREF(x);
5353 return -1;
5354 }
5355 *result = x;
5356 return 0;
5357 }
5358 else if (PyUnicode_Check(x)) {
5359 *result = x;
5360 return 0;
5361 }
5362 else {
5363 /* wrong return value */
5364 PyErr_SetString(PyExc_TypeError,
5365 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 Py_DECREF(x);
5367 return -1;
5368 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369}
5370/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 if not reallocate and adjust various state variables.
5372 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373static
Walter Dörwald4894c302003-10-24 14:25:28 +00005374int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005378 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 /* remember old output position */
5380 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5381 /* exponentially overallocate to minimize reallocations */
5382 if (requiredsize < 2 * oldsize)
5383 requiredsize = 2 * oldsize;
5384 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5385 return -1;
5386 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 }
5388 return 0;
5389}
5390/* lookup the character, put the result in the output string and adjust
5391 various state variables. Return a new reference to the object that
5392 was put in the output buffer in *result, or Py_None, if the mapping was
5393 undefined (in which case no character was written).
5394 The called must decref result.
5395 Return 0 on success, -1 on error. */
5396static
Walter Dörwald4894c302003-10-24 14:25:28 +00005397int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5399 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400{
Walter Dörwald4894c302003-10-24 14:25:28 +00005401 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 /* not found => default to 1:1 mapping */
5405 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 }
5407 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005409 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 /* no overflow check, because we know that the space is enough */
5411 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412 }
5413 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5415 if (repsize==1) {
5416 /* no overflow check, because we know that the space is enough */
5417 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5418 }
5419 else if (repsize!=0) {
5420 /* more than one character */
5421 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5422 (insize - (curinp-startinp)) +
5423 repsize - 1;
5424 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5425 return -1;
5426 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5427 *outp += repsize;
5428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 }
5430 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 return 0;
5433}
5434
5435PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 Py_ssize_t size,
5437 PyObject *mapping,
5438 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 /* output object */
5441 PyObject *res = NULL;
5442 /* pointers to the beginning and end+1 of input */
5443 const Py_UNICODE *startp = p;
5444 const Py_UNICODE *endp = p + size;
5445 /* pointer into the output */
5446 Py_UNICODE *str;
5447 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005448 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 char *reason = "character maps to <undefined>";
5450 PyObject *errorHandler = NULL;
5451 PyObject *exc = NULL;
5452 /* the following variable is used for caching string comparisons
5453 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5454 * 3=ignore, 4=xmlcharrefreplace */
5455 int known_errorHandler = -1;
5456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 PyErr_BadArgument();
5459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461
5462 /* allocate enough for a simple 1:1 translation without
5463 replacements, if we need more, we'll resize */
5464 res = PyUnicode_FromUnicode(NULL, size);
5465 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 /* try to encode it */
5473 PyObject *x = NULL;
5474 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5475 Py_XDECREF(x);
5476 goto onError;
5477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005478 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 if (x!=Py_None) /* it worked => adjust input pointer */
5480 ++p;
5481 else { /* untranslatable character */
5482 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5483 Py_ssize_t repsize;
5484 Py_ssize_t newpos;
5485 Py_UNICODE *uni2;
5486 /* startpos for collecting untranslatable chars */
5487 const Py_UNICODE *collstart = p;
5488 const Py_UNICODE *collend = p+1;
5489 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* find all untranslatable characters */
5492 while (collend < endp) {
5493 if (charmaptranslate_lookup(*collend, mapping, &x))
5494 goto onError;
5495 Py_XDECREF(x);
5496 if (x!=Py_None)
5497 break;
5498 ++collend;
5499 }
5500 /* cache callback name lookup
5501 * (if not done yet, i.e. it's the first error) */
5502 if (known_errorHandler==-1) {
5503 if ((errors==NULL) || (!strcmp(errors, "strict")))
5504 known_errorHandler = 1;
5505 else if (!strcmp(errors, "replace"))
5506 known_errorHandler = 2;
5507 else if (!strcmp(errors, "ignore"))
5508 known_errorHandler = 3;
5509 else if (!strcmp(errors, "xmlcharrefreplace"))
5510 known_errorHandler = 4;
5511 else
5512 known_errorHandler = 0;
5513 }
5514 switch (known_errorHandler) {
5515 case 1: /* strict */
5516 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005517 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 case 2: /* replace */
5519 /* No need to check for space, this is a 1:1 replacement */
5520 for (coll = collstart; coll<collend; ++coll)
5521 *str++ = '?';
5522 /* fall through */
5523 case 3: /* ignore */
5524 p = collend;
5525 break;
5526 case 4: /* xmlcharrefreplace */
5527 /* generate replacement (temporarily (mis)uses p) */
5528 for (p = collstart; p < collend; ++p) {
5529 char buffer[2+29+1+1];
5530 char *cp;
5531 sprintf(buffer, "&#%d;", (int)*p);
5532 if (charmaptranslate_makespace(&res, &str,
5533 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5534 goto onError;
5535 for (cp = buffer; *cp; ++cp)
5536 *str++ = *cp;
5537 }
5538 p = collend;
5539 break;
5540 default:
5541 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5542 reason, startp, size, &exc,
5543 collstart-startp, collend-startp, &newpos);
5544 if (repunicode == NULL)
5545 goto onError;
5546 /* generate replacement */
5547 repsize = PyUnicode_GET_SIZE(repunicode);
5548 if (charmaptranslate_makespace(&res, &str,
5549 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5550 Py_DECREF(repunicode);
5551 goto onError;
5552 }
5553 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5554 *str++ = *uni2;
5555 p = startp + newpos;
5556 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005557 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005558 }
5559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 /* Resize if we allocated to much */
5561 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005562 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 if (PyUnicode_Resize(&res, respos) < 0)
5564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 }
5566 Py_XDECREF(exc);
5567 Py_XDECREF(errorHandler);
5568 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 Py_XDECREF(res);
5572 Py_XDECREF(exc);
5573 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return NULL;
5575}
5576
5577PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 PyObject *mapping,
5579 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580{
5581 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005582
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 str = PyUnicode_FromObject(str);
5584 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 PyUnicode_GET_SIZE(str),
5588 mapping,
5589 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 Py_DECREF(str);
5591 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 Py_XDECREF(str);
5595 return NULL;
5596}
Tim Petersced69f82003-09-16 20:30:58 +00005597
Guido van Rossum9e896b32000-04-05 20:11:21 +00005598/* --- Decimal Encoder ---------------------------------------------------- */
5599
5600int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 Py_ssize_t length,
5602 char *output,
5603 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005604{
5605 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 PyObject *errorHandler = NULL;
5607 PyObject *exc = NULL;
5608 const char *encoding = "decimal";
5609 const char *reason = "invalid decimal Unicode string";
5610 /* the following variable is used for caching string comparisons
5611 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5612 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005613
5614 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 PyErr_BadArgument();
5616 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005617 }
5618
5619 p = s;
5620 end = s + length;
5621 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 register Py_UNICODE ch = *p;
5623 int decimal;
5624 PyObject *repunicode;
5625 Py_ssize_t repsize;
5626 Py_ssize_t newpos;
5627 Py_UNICODE *uni2;
5628 Py_UNICODE *collstart;
5629 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005630
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005632 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 ++p;
5634 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005635 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 decimal = Py_UNICODE_TODECIMAL(ch);
5637 if (decimal >= 0) {
5638 *output++ = '0' + decimal;
5639 ++p;
5640 continue;
5641 }
5642 if (0 < ch && ch < 256) {
5643 *output++ = (char)ch;
5644 ++p;
5645 continue;
5646 }
5647 /* All other characters are considered unencodable */
5648 collstart = p;
5649 collend = p+1;
5650 while (collend < end) {
5651 if ((0 < *collend && *collend < 256) ||
5652 !Py_UNICODE_ISSPACE(*collend) ||
5653 Py_UNICODE_TODECIMAL(*collend))
5654 break;
5655 }
5656 /* cache callback name lookup
5657 * (if not done yet, i.e. it's the first error) */
5658 if (known_errorHandler==-1) {
5659 if ((errors==NULL) || (!strcmp(errors, "strict")))
5660 known_errorHandler = 1;
5661 else if (!strcmp(errors, "replace"))
5662 known_errorHandler = 2;
5663 else if (!strcmp(errors, "ignore"))
5664 known_errorHandler = 3;
5665 else if (!strcmp(errors, "xmlcharrefreplace"))
5666 known_errorHandler = 4;
5667 else
5668 known_errorHandler = 0;
5669 }
5670 switch (known_errorHandler) {
5671 case 1: /* strict */
5672 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5673 goto onError;
5674 case 2: /* replace */
5675 for (p = collstart; p < collend; ++p)
5676 *output++ = '?';
5677 /* fall through */
5678 case 3: /* ignore */
5679 p = collend;
5680 break;
5681 case 4: /* xmlcharrefreplace */
5682 /* generate replacement (temporarily (mis)uses p) */
5683 for (p = collstart; p < collend; ++p)
5684 output += sprintf(output, "&#%d;", (int)*p);
5685 p = collend;
5686 break;
5687 default:
5688 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5689 encoding, reason, s, length, &exc,
5690 collstart-s, collend-s, &newpos);
5691 if (repunicode == NULL)
5692 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005693 if (!PyUnicode_Check(repunicode)) {
5694 /* Implementation limitation: byte results not supported yet. */
5695 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5696 Py_DECREF(repunicode);
5697 goto onError;
5698 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 /* generate replacement */
5700 repsize = PyUnicode_GET_SIZE(repunicode);
5701 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5702 Py_UNICODE ch = *uni2;
5703 if (Py_UNICODE_ISSPACE(ch))
5704 *output++ = ' ';
5705 else {
5706 decimal = Py_UNICODE_TODECIMAL(ch);
5707 if (decimal >= 0)
5708 *output++ = '0' + decimal;
5709 else if (0 < ch && ch < 256)
5710 *output++ = (char)ch;
5711 else {
5712 Py_DECREF(repunicode);
5713 raise_encode_exception(&exc, encoding,
5714 s, length, collstart-s, collend-s, reason);
5715 goto onError;
5716 }
5717 }
5718 }
5719 p = s + newpos;
5720 Py_DECREF(repunicode);
5721 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005722 }
5723 /* 0-terminate the output string */
5724 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 Py_XDECREF(exc);
5726 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005727 return 0;
5728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 Py_XDECREF(exc);
5731 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005732 return -1;
5733}
5734
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735/* --- Helpers ------------------------------------------------------------ */
5736
Eric Smith8c663262007-08-25 02:26:07 +00005737#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005738#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005739#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005740/* Include _ParseTupleFinds from find.h */
5741#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005742#include "stringlib/find.h"
5743#include "stringlib/partition.h"
5744
Eric Smith5807c412008-05-11 21:00:57 +00005745#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005746#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005747#include "stringlib/localeutil.h"
5748
/* Helper macro to fix up slice bounds against (obj)->length.

   Operates on the variables named start and end in the enclosing scope:
   a negative start is taken relative to the length and floored at 0;
   end is capped at the length, and a negative end is taken relative to
   the length and floored at 0.  Note that this expands to several
   statements and that obj is evaluated more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5761
Martin v. Löwis18e16552006-02-15 17:27:45 +00005762Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005763 PyObject *substr,
5764 Py_ssize_t start,
5765 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005767 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005768 PyUnicodeObject* str_obj;
5769 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005770
Thomas Wouters477c8d52006-05-27 19:21:47 +00005771 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5772 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5775 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 Py_DECREF(str_obj);
5777 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 }
Tim Petersced69f82003-09-16 20:30:58 +00005779
Thomas Wouters477c8d52006-05-27 19:21:47 +00005780 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005781
Thomas Wouters477c8d52006-05-27 19:21:47 +00005782 result = stringlib_count(
5783 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5784 );
5785
5786 Py_DECREF(sub_obj);
5787 Py_DECREF(str_obj);
5788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 return result;
5790}
5791
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005793 PyObject *sub,
5794 Py_ssize_t start,
5795 Py_ssize_t end,
5796 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005799
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005801 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005803 sub = PyUnicode_FromObject(sub);
5804 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 Py_DECREF(str);
5806 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 }
Tim Petersced69f82003-09-16 20:30:58 +00005808
Thomas Wouters477c8d52006-05-27 19:21:47 +00005809 if (direction > 0)
5810 result = stringlib_find_slice(
5811 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5812 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5813 start, end
5814 );
5815 else
5816 result = stringlib_rfind_slice(
5817 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5818 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5819 start, end
5820 );
5821
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 Py_DECREF(sub);
5824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 return result;
5826}
5827
Tim Petersced69f82003-09-16 20:30:58 +00005828static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 PyUnicodeObject *substring,
5831 Py_ssize_t start,
5832 Py_ssize_t end,
5833 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 if (substring->length == 0)
5836 return 1;
5837
Thomas Wouters477c8d52006-05-27 19:21:47 +00005838 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 end -= substring->length;
5841 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
5844 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 if (Py_UNICODE_MATCH(self, end, substring))
5846 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 } else {
5848 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
5851
5852 return 0;
5853}
5854
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 PyObject *substr,
5857 Py_ssize_t start,
5858 Py_ssize_t end,
5859 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005861 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 str = PyUnicode_FromObject(str);
5864 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 substr = PyUnicode_FromObject(substr);
5867 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 Py_DECREF(str);
5869 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Tim Petersced69f82003-09-16 20:30:58 +00005871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 (PyUnicodeObject *)substr,
5874 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 Py_DECREF(str);
5876 Py_DECREF(substr);
5877 return result;
5878}
5879
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880/* Apply fixfct filter to the Unicode object self and return a
5881 reference to the modified object */
5882
Tim Petersced69f82003-09-16 20:30:58 +00005883static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
5887
5888 PyUnicodeObject *u;
5889
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005890 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005893
5894 Py_UNICODE_COPY(u->str, self->str, self->length);
5895
Tim Peters7a29bd52001-09-12 03:03:31 +00005896 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 /* fixfct should return TRUE if it modified the buffer. If
5898 FALSE, return a reference to the original buffer instead
5899 (to save space, not time) */
5900 Py_INCREF(self);
5901 Py_DECREF(u);
5902 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 }
5904 return (PyObject*) u;
5905}
5906
Tim Petersced69f82003-09-16 20:30:58 +00005907static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908int fixupper(PyUnicodeObject *self)
5909{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 Py_UNICODE *s = self->str;
5912 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 ch = Py_UNICODE_TOUPPER(*s);
5918 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 *s = ch;
5921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 s++;
5923 }
5924
5925 return status;
5926}
5927
Tim Petersced69f82003-09-16 20:30:58 +00005928static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929int fixlower(PyUnicodeObject *self)
5930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005931 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 Py_UNICODE *s = self->str;
5933 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005937
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 ch = Py_UNICODE_TOLOWER(*s);
5939 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 *s = ch;
5942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 s++;
5944 }
5945
5946 return status;
5947}
5948
Tim Petersced69f82003-09-16 20:30:58 +00005949static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950int fixswapcase(PyUnicodeObject *self)
5951{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005952 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 Py_UNICODE *s = self->str;
5954 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005955
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 while (len-- > 0) {
5957 if (Py_UNICODE_ISUPPER(*s)) {
5958 *s = Py_UNICODE_TOLOWER(*s);
5959 status = 1;
5960 } else if (Py_UNICODE_ISLOWER(*s)) {
5961 *s = Py_UNICODE_TOUPPER(*s);
5962 status = 1;
5963 }
5964 s++;
5965 }
5966
5967 return status;
5968}
5969
Tim Petersced69f82003-09-16 20:30:58 +00005970static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971int fixcapitalize(PyUnicodeObject *self)
5972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005973 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005974 Py_UNICODE *s = self->str;
5975 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005976
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005977 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005979 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 *s = Py_UNICODE_TOUPPER(*s);
5981 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005983 s++;
5984 while (--len > 0) {
5985 if (Py_UNICODE_ISUPPER(*s)) {
5986 *s = Py_UNICODE_TOLOWER(*s);
5987 status = 1;
5988 }
5989 s++;
5990 }
5991 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992}
5993
5994static
5995int fixtitle(PyUnicodeObject *self)
5996{
5997 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5998 register Py_UNICODE *e;
5999 int previous_is_cased;
6000
6001 /* Shortcut for single character strings */
6002 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6004 if (*p != ch) {
6005 *p = ch;
6006 return 1;
6007 }
6008 else
6009 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 }
Tim Petersced69f82003-09-16 20:30:58 +00006011
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 e = p + PyUnicode_GET_SIZE(self);
6013 previous_is_cased = 0;
6014 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 if (previous_is_cased)
6018 *p = Py_UNICODE_TOLOWER(ch);
6019 else
6020 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 if (Py_UNICODE_ISLOWER(ch) ||
6023 Py_UNICODE_ISUPPER(ch) ||
6024 Py_UNICODE_ISTITLE(ch))
6025 previous_is_cased = 1;
6026 else
6027 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 }
6029 return 1;
6030}
6031
Tim Peters8ce9f162004-08-27 01:49:32 +00006032PyObject *
6033PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
Skip Montanaro6543b452004-09-16 03:28:13 +00006035 const Py_UNICODE blank = ' ';
6036 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006037 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006038 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006039 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6040 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006041 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6042 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006043 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006044 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Tim Peters05eba1f2004-08-27 21:32:02 +00006046 fseq = PySequence_Fast(seq, "");
6047 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006048 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006049 }
6050
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006051 /* NOTE: the following code can't call back into Python code,
6052 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006053 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006054
Tim Peters05eba1f2004-08-27 21:32:02 +00006055 seqlen = PySequence_Fast_GET_SIZE(fseq);
6056 /* If empty sequence, return u"". */
6057 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006058 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6059 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006060 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006061 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006062 /* If singleton sequence with an exact Unicode, return that. */
6063 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 item = items[0];
6065 if (PyUnicode_CheckExact(item)) {
6066 Py_INCREF(item);
6067 res = (PyUnicodeObject *)item;
6068 goto Done;
6069 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006070 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006071 else {
6072 /* Set up sep and seplen */
6073 if (separator == NULL) {
6074 sep = &blank;
6075 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006076 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006077 else {
6078 if (!PyUnicode_Check(separator)) {
6079 PyErr_Format(PyExc_TypeError,
6080 "separator: expected str instance,"
6081 " %.80s found",
6082 Py_TYPE(separator)->tp_name);
6083 goto onError;
6084 }
6085 sep = PyUnicode_AS_UNICODE(separator);
6086 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006087 }
6088 }
6089
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006090 /* There are at least two things to join, or else we have a subclass
6091 * of str in the sequence.
6092 * Do a pre-pass to figure out the total amount of space we'll
6093 * need (sz), and see whether all argument are strings.
6094 */
6095 sz = 0;
6096 for (i = 0; i < seqlen; i++) {
6097 const Py_ssize_t old_sz = sz;
6098 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 if (!PyUnicode_Check(item)) {
6100 PyErr_Format(PyExc_TypeError,
6101 "sequence item %zd: expected str instance,"
6102 " %.80s found",
6103 i, Py_TYPE(item)->tp_name);
6104 goto onError;
6105 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006106 sz += PyUnicode_GET_SIZE(item);
6107 if (i != 0)
6108 sz += seplen;
6109 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6110 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006112 goto onError;
6113 }
6114 }
Tim Petersced69f82003-09-16 20:30:58 +00006115
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006116 res = _PyUnicode_New(sz);
6117 if (res == NULL)
6118 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006119
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006120 /* Catenate everything. */
6121 res_p = PyUnicode_AS_UNICODE(res);
6122 for (i = 0; i < seqlen; ++i) {
6123 Py_ssize_t itemlen;
6124 item = items[i];
6125 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 /* Copy item, and maybe the separator. */
6127 if (i) {
6128 Py_UNICODE_COPY(res_p, sep, seplen);
6129 res_p += seplen;
6130 }
6131 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6132 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006133 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006136 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 return (PyObject *)res;
6138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006140 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006141 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 return NULL;
6143}
6144
Tim Petersced69f82003-09-16 20:30:58 +00006145static
6146PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 Py_ssize_t left,
6148 Py_ssize_t right,
6149 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150{
6151 PyUnicodeObject *u;
6152
6153 if (left < 0)
6154 left = 0;
6155 if (right < 0)
6156 right = 0;
6157
Tim Peters7a29bd52001-09-12 03:03:31 +00006158 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 Py_INCREF(self);
6160 return self;
6161 }
6162
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006163 if (left > PY_SSIZE_T_MAX - self->length ||
6164 right > PY_SSIZE_T_MAX - (left + self->length)) {
6165 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6166 return NULL;
6167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 u = _PyUnicode_New(left + self->length + right);
6169 if (u) {
6170 if (left)
6171 Py_UNICODE_FILL(u->str, fill, left);
6172 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6173 if (right)
6174 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6175 }
6176
6177 return u;
6178}
6179
/* Append the slice data[left:right] to `list` as a new unicode object.

   Expands in a function that provides the locals `str` and `list` and an
   `onError` label; control jumps to onError if the object cannot be
   created or appended.  The body is wrapped in do { } while (0) so that
   the expansion is a single statement and stays correct even under an
   unbraced if/else.  Use it as a statement terminated by ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
6191static
6192PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 PyObject *list,
6194 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 register Py_ssize_t i;
6197 register Py_ssize_t j;
6198 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006200 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201
6202 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006204 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006206 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6208 i++;
6209 if (j < i) {
6210 if (maxcount-- <= 0)
6211 break;
6212 SPLIT_APPEND(buf, j, i);
6213 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6214 i++;
6215 j = i;
6216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 }
6218 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
6221 return list;
6222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 Py_DECREF(list);
6225 return NULL;
6226}
6227
6228PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006231 register Py_ssize_t i;
6232 register Py_ssize_t j;
6233 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 PyObject *list;
6235 PyObject *str;
6236 Py_UNICODE *data;
6237
6238 string = PyUnicode_FromObject(string);
6239 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 data = PyUnicode_AS_UNICODE(string);
6242 len = PyUnicode_GET_SIZE(string);
6243
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 list = PyList_New(0);
6245 if (!list)
6246 goto onError;
6247
6248 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* Find a line and append it */
6252 while (i < len && !BLOOM_LINEBREAK(data[i]))
6253 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006256 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 if (i < len) {
6258 if (data[i] == '\r' && i + 1 < len &&
6259 data[i+1] == '\n')
6260 i += 2;
6261 else
6262 i++;
6263 if (keepends)
6264 eol = i;
6265 }
6266 SPLIT_APPEND(data, j, eol);
6267 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 }
6269 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 }
6272
6273 Py_DECREF(string);
6274 return list;
6275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006277 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 Py_DECREF(string);
6279 return NULL;
6280}
6281
Tim Petersced69f82003-09-16 20:30:58 +00006282static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 PyObject *list,
6285 Py_UNICODE ch,
6286 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006288 register Py_ssize_t i;
6289 register Py_ssize_t j;
6290 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006292 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293
6294 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 if (buf[i] == ch) {
6296 if (maxcount-- <= 0)
6297 break;
6298 SPLIT_APPEND(buf, j, i);
6299 i = j = i + 1;
6300 } else
6301 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 }
6303 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 }
6306 return list;
6307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 Py_DECREF(list);
6310 return NULL;
6311}
6312
Tim Petersced69f82003-09-16 20:30:58 +00006313static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 PyObject *list,
6316 PyUnicodeObject *substring,
6317 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006319 register Py_ssize_t i;
6320 register Py_ssize_t j;
6321 Py_ssize_t len = self->length;
6322 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 PyObject *str;
6324
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006325 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 if (Py_UNICODE_MATCH(self, i, substring)) {
6327 if (maxcount-- <= 0)
6328 break;
6329 SPLIT_APPEND(self->str, j, i);
6330 i = j = i + sublen;
6331 } else
6332 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 }
6334 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 }
6337 return list;
6338
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 Py_DECREF(list);
6341 return NULL;
6342}
6343
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006344static
6345PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 PyObject *list,
6347 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006348{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006349 register Py_ssize_t i;
6350 register Py_ssize_t j;
6351 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006352 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006353 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006354
6355 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006357 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006359 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6361 i--;
6362 if (j > i) {
6363 if (maxcount-- <= 0)
6364 break;
6365 SPLIT_APPEND(buf, i + 1, j + 1);
6366 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6367 i--;
6368 j = i;
6369 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006370 }
6371 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006374 if (PyList_Reverse(list) < 0)
6375 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006376 return list;
6377
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006379 Py_DECREF(list);
6380 return NULL;
6381}
6382
Benjamin Peterson14339b62009-01-31 16:36:08 +00006383static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006384PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 PyObject *list,
6386 Py_UNICODE ch,
6387 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 register Py_ssize_t i;
6390 register Py_ssize_t j;
6391 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006392 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006393 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006394
6395 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (buf[i] == ch) {
6397 if (maxcount-- <= 0)
6398 break;
6399 SPLIT_APPEND(buf, i + 1, j + 1);
6400 j = i = i - 1;
6401 } else
6402 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006403 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006404 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006406 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006407 if (PyList_Reverse(list) < 0)
6408 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006409 return list;
6410
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006412 Py_DECREF(list);
6413 return NULL;
6414}
6415
Benjamin Peterson14339b62009-01-31 16:36:08 +00006416static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006417PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 PyObject *list,
6419 PyUnicodeObject *substring,
6420 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006422 register Py_ssize_t i;
6423 register Py_ssize_t j;
6424 Py_ssize_t len = self->length;
6425 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006426 PyObject *str;
6427
6428 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 if (Py_UNICODE_MATCH(self, i, substring)) {
6430 if (maxcount-- <= 0)
6431 break;
6432 SPLIT_APPEND(self->str, i + sublen, j);
6433 j = i;
6434 i -= sublen;
6435 } else
6436 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006437 }
6438 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006440 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006441 if (PyList_Reverse(list) < 0)
6442 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006443 return list;
6444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006446 Py_DECREF(list);
6447 return NULL;
6448}
6449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450#undef SPLIT_APPEND
6451
6452static
6453PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 PyUnicodeObject *substring,
6455 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456{
6457 PyObject *list;
6458
6459 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006460 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
6462 list = PyList_New(0);
6463 if (!list)
6464 return NULL;
6465
6466 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468
6469 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 Py_DECREF(list);
6474 PyErr_SetString(PyExc_ValueError, "empty separator");
6475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
6477 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479}
6480
Tim Petersced69f82003-09-16 20:30:58 +00006481static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006482PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 PyUnicodeObject *substring,
6484 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006485{
6486 PyObject *list;
6487
6488 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006489 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006490
6491 list = PyList_New(0);
6492 if (!list)
6493 return NULL;
6494
6495 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006497
6498 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006500
6501 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 Py_DECREF(list);
6503 PyErr_SetString(PyExc_ValueError, "empty separator");
6504 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006505 }
6506 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006508}
6509
6510static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 PyUnicodeObject *str1,
6513 PyUnicodeObject *str2,
6514 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515{
6516 PyUnicodeObject *u;
6517
6518 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
Thomas Wouters477c8d52006-05-27 19:21:47 +00006521 if (str1->length == str2->length) {
6522 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006523 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 if (str1->length == 1) {
6525 /* replace characters */
6526 Py_UNICODE u1, u2;
6527 if (!findchar(self->str, self->length, str1->str[0]))
6528 goto nothing;
6529 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6530 if (!u)
6531 return NULL;
6532 Py_UNICODE_COPY(u->str, self->str, self->length);
6533 u1 = str1->str[0];
6534 u2 = str2->str[0];
6535 for (i = 0; i < u->length; i++)
6536 if (u->str[i] == u1) {
6537 if (--maxcount < 0)
6538 break;
6539 u->str[i] = u2;
6540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 i = fastsearch(
6543 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 if (i < 0)
6546 goto nothing;
6547 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6548 if (!u)
6549 return NULL;
6550 Py_UNICODE_COPY(u->str, self->str, self->length);
6551 while (i <= self->length - str1->length)
6552 if (Py_UNICODE_MATCH(self, i, str1)) {
6553 if (--maxcount < 0)
6554 break;
6555 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6556 i += str1->length;
6557 } else
6558 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006561
6562 Py_ssize_t n, i, j, e;
6563 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_UNICODE *p;
6565
6566 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 if (n > maxcount)
6569 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 if (n == 0)
6571 goto nothing;
6572 /* new_size = self->length + n * (str2->length - str1->length)); */
6573 delta = (str2->length - str1->length);
6574 if (delta == 0) {
6575 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006577 product = n * (str2->length - str1->length);
6578 if ((product / (str2->length - str1->length)) != n) {
6579 PyErr_SetString(PyExc_OverflowError,
6580 "replace string is too long");
6581 return NULL;
6582 }
6583 new_size = self->length + product;
6584 if (new_size < 0) {
6585 PyErr_SetString(PyExc_OverflowError,
6586 "replace string is too long");
6587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 }
6589 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590 u = _PyUnicode_New(new_size);
6591 if (!u)
6592 return NULL;
6593 i = 0;
6594 p = u->str;
6595 e = self->length - str1->length;
6596 if (str1->length > 0) {
6597 while (n-- > 0) {
6598 /* look for next match */
6599 j = i;
6600 while (j <= e) {
6601 if (Py_UNICODE_MATCH(self, j, str1))
6602 break;
6603 j++;
6604 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006606 if (j > e)
6607 break;
6608 /* copy unchanged part [i:j] */
6609 Py_UNICODE_COPY(p, self->str+i, j-i);
6610 p += j - i;
6611 }
6612 /* copy substitution string */
6613 if (str2->length > 0) {
6614 Py_UNICODE_COPY(p, str2->str, str2->length);
6615 p += str2->length;
6616 }
6617 i = j + str1->length;
6618 }
6619 if (i < self->length)
6620 /* copy tail [i:] */
6621 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6622 } else {
6623 /* interleave */
6624 while (n > 0) {
6625 Py_UNICODE_COPY(p, str2->str, str2->length);
6626 p += str2->length;
6627 if (--n <= 0)
6628 break;
6629 *p++ = self->str[i++];
6630 }
6631 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006637 /* nothing to replace; return original string (when possible) */
6638 if (PyUnicode_CheckExact(self)) {
6639 Py_INCREF(self);
6640 return (PyObject *) self;
6641 }
6642 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
6645/* --- Unicode Object Methods --------------------------------------------- */
6646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649\n\
6650Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006651characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
6653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006654unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 return fixup(self, fixtitle);
6657}
6658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006659PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661\n\
6662Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
6665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006666unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 return fixup(self, fixcapitalize);
6669}
6670
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace every list entry by its fixup(..., fixcapitalize)
   result, then join the list back into one string.
   Returns NULL if splitting, fixing up, or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry, so drop it here */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6708
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006709/* Argument converter. Coerces to a single unicode character */
6710
6711static int
6712convert_uc(PyObject *obj, void *addr)
6713{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006714 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6715 PyObject *uniobj;
6716 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006717
Benjamin Peterson14339b62009-01-31 16:36:08 +00006718 uniobj = PyUnicode_FromObject(obj);
6719 if (uniobj == NULL) {
6720 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006722 return 0;
6723 }
6724 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6725 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006727 Py_DECREF(uniobj);
6728 return 0;
6729 }
6730 unistr = PyUnicode_AS_UNICODE(uniobj);
6731 *fillcharloc = unistr[0];
6732 Py_DECREF(uniobj);
6733 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006734}
6735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006736PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006739Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006740done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741
6742static PyObject *
6743unicode_center(PyUnicodeObject *self, PyObject *args)
6744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t marg, left;
6746 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006747 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
Thomas Woutersde017742006-02-16 19:34:37 +00006749 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751
Tim Peters7a29bd52001-09-12 03:03:31 +00006752 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 Py_INCREF(self);
6754 return (PyObject*) self;
6755 }
6756
6757 marg = width - self->length;
6758 left = marg / 2 + (marg & width & 1);
6759
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006760 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761}
6762
Marc-André Lemburge5034372000-08-08 08:04:29 +00006763#if 0
6764
6765/* This code should go into some future Unicode collation support
6766 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006767 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006768
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006769/* speedy UTF-16 code point order comparison */
6770/* gleaned from: */
6771/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6772
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006773static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006774{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006775 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006776 0, 0, 0, 0, 0, 0, 0, 0,
6777 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006778 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006779};
6780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781static int
6782unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6783{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006784 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 Py_UNICODE *s1 = str1->str;
6787 Py_UNICODE *s2 = str2->str;
6788
6789 len1 = str1->length;
6790 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006793 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006794
6795 c1 = *s1++;
6796 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006797
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 if (c1 > (1<<11) * 26)
6799 c1 += utf16Fixup[c1>>11];
6800 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006801 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006802 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006803
6804 if (c1 != c2)
6805 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006806
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006807 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 }
6809
6810 return (len1 < len2) ? -1 : (len1 != len2);
6811}
6812
Marc-André Lemburge5034372000-08-08 08:04:29 +00006813#else
6814
6815static int
6816unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006818 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006819
6820 Py_UNICODE *s1 = str1->str;
6821 Py_UNICODE *s2 = str2->str;
6822
6823 len1 = str1->length;
6824 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006825
Marc-André Lemburge5034372000-08-08 08:04:29 +00006826 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006827 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006828
Fredrik Lundh45714e92001-06-26 16:39:36 +00006829 c1 = *s1++;
6830 c2 = *s2++;
6831
6832 if (c1 != c2)
6833 return (c1 < c2) ? -1 : 1;
6834
Marc-André Lemburge5034372000-08-08 08:04:29 +00006835 len1--; len2--;
6836 }
6837
6838 return (len1 < len2) ? -1 : (len1 != len2);
6839}
6840
6841#endif
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006846 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6847 return unicode_compare((PyUnicodeObject *)left,
6848 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006849 PyErr_Format(PyExc_TypeError,
6850 "Can't compare %.100s and %.100s",
6851 left->ob_type->tp_name,
6852 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return -1;
6854}
6855
Martin v. Löwis5b222132007-06-10 09:51:05 +00006856int
6857PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6858{
6859 int i;
6860 Py_UNICODE *id;
6861 assert(PyUnicode_Check(uni));
6862 id = PyUnicode_AS_UNICODE(uni);
6863 /* Compare Unicode string and source character set string */
6864 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 if (id[i] != str[i])
6866 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006867 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006869 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006871 return 0;
6872}
6873
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006874
Benjamin Peterson29060642009-01-31 22:14:21 +00006875#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006876 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006877
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006878PyObject *PyUnicode_RichCompare(PyObject *left,
6879 PyObject *right,
6880 int op)
6881{
6882 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006884 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6885 PyObject *v;
6886 if (((PyUnicodeObject *) left)->length !=
6887 ((PyUnicodeObject *) right)->length) {
6888 if (op == Py_EQ) {
6889 Py_INCREF(Py_False);
6890 return Py_False;
6891 }
6892 if (op == Py_NE) {
6893 Py_INCREF(Py_True);
6894 return Py_True;
6895 }
6896 }
6897 if (left == right)
6898 result = 0;
6899 else
6900 result = unicode_compare((PyUnicodeObject *)left,
6901 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006902
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006903 /* Convert the return value to a Boolean */
6904 switch (op) {
6905 case Py_EQ:
6906 v = TEST_COND(result == 0);
6907 break;
6908 case Py_NE:
6909 v = TEST_COND(result != 0);
6910 break;
6911 case Py_LE:
6912 v = TEST_COND(result <= 0);
6913 break;
6914 case Py_GE:
6915 v = TEST_COND(result >= 0);
6916 break;
6917 case Py_LT:
6918 v = TEST_COND(result == -1);
6919 break;
6920 case Py_GT:
6921 v = TEST_COND(result == 1);
6922 break;
6923 default:
6924 PyErr_BadArgument();
6925 return NULL;
6926 }
6927 Py_INCREF(v);
6928 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006929 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006930
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006931 Py_INCREF(Py_NotImplemented);
6932 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006933}
6934
Guido van Rossum403d68b2000-03-13 15:55:09 +00006935int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006937{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006938 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006940
6941 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942 sub = PyUnicode_FromObject(element);
6943 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 PyErr_Format(PyExc_TypeError,
6945 "'in <string>' requires string as left operand, not %s",
6946 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006947 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006948 }
6949
Thomas Wouters477c8d52006-05-27 19:21:47 +00006950 str = PyUnicode_FromObject(container);
6951 if (!str) {
6952 Py_DECREF(sub);
6953 return -1;
6954 }
6955
6956 result = stringlib_contains_obj(str, sub);
6957
6958 Py_DECREF(str);
6959 Py_DECREF(sub);
6960
Guido van Rossum403d68b2000-03-13 15:55:09 +00006961 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006962}
6963
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964/* Concat to string or Unicode object giving a new Unicode object. */
6965
6966PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
6969 PyUnicodeObject *u = NULL, *v = NULL, *w;
6970
6971 /* Coerce the two arguments */
6972 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6973 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6976 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979 /* Shortcuts */
6980 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 Py_DECREF(v);
6982 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 }
6984 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 Py_DECREF(u);
6986 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
6988
6989 /* Concat the two Unicode strings */
6990 w = _PyUnicode_New(u->length + v->length);
6991 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 Py_UNICODE_COPY(w->str, u->str, u->length);
6994 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6995
6996 Py_DECREF(u);
6997 Py_DECREF(v);
6998 return (PyObject *)w;
6999
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 Py_XDECREF(u);
7002 Py_XDECREF(v);
7003 return NULL;
7004}
7005
Walter Dörwald1ab83302007-05-18 17:15:44 +00007006void
7007PyUnicode_Append(PyObject **pleft, PyObject *right)
7008{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007009 PyObject *new;
7010 if (*pleft == NULL)
7011 return;
7012 if (right == NULL || !PyUnicode_Check(*pleft)) {
7013 Py_DECREF(*pleft);
7014 *pleft = NULL;
7015 return;
7016 }
7017 new = PyUnicode_Concat(*pleft, right);
7018 Py_DECREF(*pleft);
7019 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007020}
7021
7022void
7023PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7024{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007025 PyUnicode_Append(pleft, right);
7026 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007027}
7028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007032Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007033string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036static PyObject *
7037unicode_count(PyUnicodeObject *self, PyObject *args)
7038{
7039 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007040 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007041 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 PyObject *result;
7043
Guido van Rossumb8872e62000-05-09 14:14:27 +00007044 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 return NULL;
7047
7048 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007052
Thomas Wouters477c8d52006-05-27 19:21:47 +00007053 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Christian Heimes217cfd12007-12-02 14:31:20 +00007055 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 stringlib_count(self->str + start, end - start,
7057 substring->str, substring->length)
7058 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007061
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 return result;
7063}
7064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007065PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007068Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007069to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007070handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7072'xmlcharrefreplace' as well as any other name registered with\n\
7073codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075static PyObject *
7076unicode_encode(PyUnicodeObject *self, PyObject *args)
7077{
7078 char *encoding = NULL;
7079 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007080 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007081
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7083 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007084 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007085 if (v == NULL)
7086 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007087 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007088 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007089 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007090 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007091 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007092 Py_DECREF(v);
7093 return NULL;
7094 }
7095 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007096
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007098 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007099}
7100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007101PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103\n\
7104Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007105If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
7107static PyObject*
7108unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7109{
7110 Py_UNICODE *e;
7111 Py_UNICODE *p;
7112 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007113 Py_UNICODE *qe;
7114 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 PyUnicodeObject *u;
7116 int tabsize = 8;
7117
7118 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
Thomas Wouters7e474022000-07-16 12:04:32 +00007121 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007122 i = 0; /* chars up to and including most recent \n or \r */
7123 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7124 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 for (p = self->str; p < e; p++)
7126 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 if (tabsize > 0) {
7128 incr = tabsize - (j % tabsize); /* cannot overflow */
7129 if (j > PY_SSIZE_T_MAX - incr)
7130 goto overflow1;
7131 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007132 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 if (j > PY_SSIZE_T_MAX - 1)
7136 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 j++;
7138 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 if (i > PY_SSIZE_T_MAX - j)
7140 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007142 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 }
7144 }
7145
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007146 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 /* Second pass: create output string and fill it */
7150 u = _PyUnicode_New(i + j);
7151 if (!u)
7152 return NULL;
7153
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007154 j = 0; /* same as in first pass */
7155 q = u->str; /* next output char */
7156 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158 for (p = self->str; p < e; p++)
7159 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 if (tabsize > 0) {
7161 i = tabsize - (j % tabsize);
7162 j += i;
7163 while (i--) {
7164 if (q >= qe)
7165 goto overflow2;
7166 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007169 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 else {
7171 if (q >= qe)
7172 goto overflow2;
7173 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007174 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 if (*p == '\n' || *p == '\r')
7176 j = 0;
7177 }
7178
7179 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007180
7181 overflow2:
7182 Py_DECREF(u);
7183 overflow1:
7184 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186}
7187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007188PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190\n\
7191Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007192such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193arguments start and end are interpreted as in slice notation.\n\
7194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007195Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject *
7198unicode_find(PyUnicodeObject *self, PyObject *args)
7199{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007200 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007201 Py_ssize_t start;
7202 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204
Christian Heimes9cd17752007-11-18 19:35:23 +00007205 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207
Thomas Wouters477c8d52006-05-27 19:21:47 +00007208 result = stringlib_find_slice(
7209 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7210 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7211 start, end
7212 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007215
Christian Heimes217cfd12007-12-02 14:31:20 +00007216 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217}
7218
7219static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221{
7222 if (index < 0 || index >= self->length) {
7223 PyErr_SetString(PyExc_IndexError, "string index out of range");
7224 return NULL;
7225 }
7226
7227 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7228}
7229
Guido van Rossumc2504932007-09-18 19:42:40 +00007230/* Believe it or not, this produces the same value for ASCII strings
7231 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007233unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234{
Guido van Rossumc2504932007-09-18 19:42:40 +00007235 Py_ssize_t len;
7236 Py_UNICODE *p;
7237 long x;
7238
7239 if (self->hash != -1)
7240 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007241 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007242 p = self->str;
7243 x = *p << 7;
7244 while (--len >= 0)
7245 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007246 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007247 if (x == -1)
7248 x = -2;
7249 self->hash = x;
7250 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251}
7252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007256Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257
7258static PyObject *
7259unicode_index(PyUnicodeObject *self, PyObject *args)
7260{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007262 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007263 Py_ssize_t start;
7264 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
Christian Heimes9cd17752007-11-18 19:35:23 +00007266 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
Thomas Wouters477c8d52006-05-27 19:21:47 +00007269 result = stringlib_find_slice(
7270 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7271 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7272 start, end
7273 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
7275 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007276
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 if (result < 0) {
7278 PyErr_SetString(PyExc_ValueError, "substring not found");
7279 return NULL;
7280 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007281
Christian Heimes217cfd12007-12-02 14:31:20 +00007282 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283}
7284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007285PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007288Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
7291static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007292unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293{
7294 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7295 register const Py_UNICODE *e;
7296 int cased;
7297
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 /* Shortcut for single character strings */
7299 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007302 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007303 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007305
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 e = p + PyUnicode_GET_SIZE(self);
7307 cased = 0;
7308 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007310
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7312 return PyBool_FromLong(0);
7313 else if (!cased && Py_UNICODE_ISLOWER(ch))
7314 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007316 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317}
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007322Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
7325static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007326unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327{
7328 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7329 register const Py_UNICODE *e;
7330 int cased;
7331
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 /* Shortcut for single character strings */
7333 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007336 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007337 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007339
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 e = p + PyUnicode_GET_SIZE(self);
7341 cased = 0;
7342 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007344
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7346 return PyBool_FromLong(0);
7347 else if (!cased && Py_UNICODE_ISUPPER(ch))
7348 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007350 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351}
7352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007353PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007356Return True if S is a titlecased string and there is at least one\n\
7357character in S, i.e. upper- and titlecase characters may only\n\
7358follow uncased characters and lowercase characters only cased ones.\n\
7359Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360
7361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007362unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363{
7364 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7365 register const Py_UNICODE *e;
7366 int cased, previous_is_cased;
7367
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 /* Shortcut for single character strings */
7369 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7371 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007373 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007374 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 e = p + PyUnicode_GET_SIZE(self);
7378 cased = 0;
7379 previous_is_cased = 0;
7380 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007382
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7384 if (previous_is_cased)
7385 return PyBool_FromLong(0);
7386 previous_is_cased = 1;
7387 cased = 1;
7388 }
7389 else if (Py_UNICODE_ISLOWER(ch)) {
7390 if (!previous_is_cased)
7391 return PyBool_FromLong(0);
7392 previous_is_cased = 1;
7393 cased = 1;
7394 }
7395 else
7396 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007398 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399}
7400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007401PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007404Return True if all characters in S are whitespace\n\
7405and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
7407static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007408unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409{
7410 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7411 register const Py_UNICODE *e;
7412
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 /* Shortcut for single character strings */
7414 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 Py_UNICODE_ISSPACE(*p))
7416 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007418 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007419 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007421
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 e = p + PyUnicode_GET_SIZE(self);
7423 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 if (!Py_UNICODE_ISSPACE(*p))
7425 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007427 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007432\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007433Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007434and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007435
7436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007437unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007438{
7439 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7440 register const Py_UNICODE *e;
7441
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007442 /* Shortcut for single character strings */
7443 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 Py_UNICODE_ISALPHA(*p))
7445 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007446
7447 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007448 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007450
7451 e = p + PyUnicode_GET_SIZE(self);
7452 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 if (!Py_UNICODE_ISALPHA(*p))
7454 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007455 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007456 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007457}
7458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007459PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007461\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007462Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007463and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007464
7465static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007466unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007467{
7468 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7469 register const Py_UNICODE *e;
7470
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007471 /* Shortcut for single character strings */
7472 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 Py_UNICODE_ISALNUM(*p))
7474 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007475
7476 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007477 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007479
7480 e = p + PyUnicode_GET_SIZE(self);
7481 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 if (!Py_UNICODE_ISALNUM(*p))
7483 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007484 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007485 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007486}
7487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007491Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493
7494static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007495unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496{
7497 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7498 register const Py_UNICODE *e;
7499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 /* Shortcut for single character strings */
7501 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 Py_UNICODE_ISDECIMAL(*p))
7503 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007505 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007506 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 e = p + PyUnicode_GET_SIZE(self);
7510 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 if (!Py_UNICODE_ISDECIMAL(*p))
7512 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007514 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515}
7516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007517PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007520Return True if all characters in S are digits\n\
7521and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007524unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525{
7526 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7527 register const Py_UNICODE *e;
7528
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 /* Shortcut for single character strings */
7530 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 Py_UNICODE_ISDIGIT(*p))
7532 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007534 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007535 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007537
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 e = p + PyUnicode_GET_SIZE(self);
7539 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 if (!Py_UNICODE_ISDIGIT(*p))
7541 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007543 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544}
7545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007546PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007549Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007550False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
7552static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007553unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554{
7555 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7556 register const Py_UNICODE *e;
7557
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 /* Shortcut for single character strings */
7559 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 Py_UNICODE_ISNUMERIC(*p))
7561 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007563 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007564 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007566
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 e = p + PyUnicode_GET_SIZE(self);
7568 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 if (!Py_UNICODE_ISNUMERIC(*p))
7570 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007572 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573}
7574
Martin v. Löwis47383402007-08-15 07:32:56 +00007575int
7576PyUnicode_IsIdentifier(PyObject *self)
7577{
7578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7579 register const Py_UNICODE *e;
7580
7581 /* Special case for empty strings */
7582 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007584
7585 /* PEP 3131 says that the first character must be in
7586 XID_Start and subsequent characters in XID_Continue,
7587 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007588 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007589 letters, digits, underscore). However, given the current
7590 definition of XID_Start and XID_Continue, it is sufficient
7591 to check just for these, except that _ must be allowed
7592 as starting an identifier. */
7593 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7594 return 0;
7595
7596 e = p + PyUnicode_GET_SIZE(self);
7597 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (!_PyUnicode_IsXidContinue(*p))
7599 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007600 }
7601 return 1;
7602}
7603
7604PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007606\n\
7607Return True if S is a valid identifier according\n\
7608to the language definition.");
7609
7610static PyObject*
7611unicode_isidentifier(PyObject *self)
7612{
7613 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7614}
7615
Georg Brandl559e5d72008-06-11 18:37:52 +00007616PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007618\n\
7619Return True if all characters in S are considered\n\
7620printable in repr() or S is empty, False otherwise.");
7621
7622static PyObject*
7623unicode_isprintable(PyObject *self)
7624{
7625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7626 register const Py_UNICODE *e;
7627
7628 /* Shortcut for single character strings */
7629 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7630 Py_RETURN_TRUE;
7631 }
7632
7633 e = p + PyUnicode_GET_SIZE(self);
7634 for (; p < e; p++) {
7635 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7636 Py_RETURN_FALSE;
7637 }
7638 }
7639 Py_RETURN_TRUE;
7640}
7641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007642PyDoc_STRVAR(join__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 "S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644\n\
7645Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647
7648static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007649unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007651 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652}
7653
Martin v. Löwis18e16552006-02-15 17:27:45 +00007654static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655unicode_length(PyUnicodeObject *self)
7656{
7657 return self->length;
7658}
7659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007660PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007663Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007664done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
7666static PyObject *
7667unicode_ljust(PyUnicodeObject *self, PyObject *args)
7668{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007669 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007670 Py_UNICODE fillchar = ' ';
7671
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007672 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 return NULL;
7674
Tim Peters7a29bd52001-09-12 03:03:31 +00007675 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 Py_INCREF(self);
7677 return (PyObject*) self;
7678 }
7679
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007680 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681}
7682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007683PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007686Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687
7688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007689unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 return fixup(self, fixlower);
7692}
7693
/* Strip modes accepted by the strip helpers; they also index stripformat. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-char "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7702
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007703/* externally visible for str.strip(unicode) */
7704PyObject *
7705_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7706{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7708 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7709 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7710 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7711 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007712
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007714
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 i = 0;
7716 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7718 i++;
7719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 j = len;
7723 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 do {
7725 j--;
7726 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7727 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007728 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 Py_INCREF(self);
7732 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 }
7734 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007736}
7737
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
7739static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007740do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7743 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007744
Benjamin Peterson14339b62009-01-31 16:36:08 +00007745 i = 0;
7746 if (striptype != RIGHTSTRIP) {
7747 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7748 i++;
7749 }
7750 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 j = len;
7753 if (striptype != LEFTSTRIP) {
7754 do {
7755 j--;
7756 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7757 j++;
7758 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007759
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7761 Py_INCREF(self);
7762 return (PyObject*)self;
7763 }
7764 else
7765 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766}
7767
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007768
7769static PyObject *
7770do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7771{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007773
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7775 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007776
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 if (sep != NULL && sep != Py_None) {
7778 if (PyUnicode_Check(sep))
7779 return _PyUnicode_XStrip(self, striptype, sep);
7780 else {
7781 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 "%s arg must be None or str",
7783 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 return NULL;
7785 }
7786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007787
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007789}
7790
7791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007792PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007794\n\
7795Return a copy of the string S with leading and trailing\n\
7796whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007797If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007798
7799static PyObject *
7800unicode_strip(PyUnicodeObject *self, PyObject *args)
7801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 if (PyTuple_GET_SIZE(args) == 0)
7803 return do_strip(self, BOTHSTRIP); /* Common case */
7804 else
7805 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806}
7807
7808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007809PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811\n\
7812Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007813If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814
7815static PyObject *
7816unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7817{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 if (PyTuple_GET_SIZE(args) == 0)
7819 return do_strip(self, LEFTSTRIP); /* Common case */
7820 else
7821 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822}
7823
7824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007827\n\
7828Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007829If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007830
7831static PyObject *
7832unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 if (PyTuple_GET_SIZE(args) == 0)
7835 return do_strip(self, RIGHTSTRIP); /* Common case */
7836 else
7837 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838}
7839
7840
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007842unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843{
7844 PyUnicodeObject *u;
7845 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007846 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007847 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
Georg Brandl222de0f2009-04-12 12:01:50 +00007849 if (len < 1) {
7850 Py_INCREF(unicode_empty);
7851 return (PyObject *)unicode_empty;
7852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853
Tim Peters7a29bd52001-09-12 03:03:31 +00007854 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 /* no repeat, return original string */
7856 Py_INCREF(str);
7857 return (PyObject*) str;
7858 }
Tim Peters8f422462000-09-09 06:13:41 +00007859
7860 /* ensure # of chars needed doesn't overflow int and # of bytes
7861 * needed doesn't overflow size_t
7862 */
7863 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007864 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007865 PyErr_SetString(PyExc_OverflowError,
7866 "repeated string is too long");
7867 return NULL;
7868 }
7869 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7870 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7871 PyErr_SetString(PyExc_OverflowError,
7872 "repeated string is too long");
7873 return NULL;
7874 }
7875 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 if (!u)
7877 return NULL;
7878
7879 p = u->str;
7880
Georg Brandl222de0f2009-04-12 12:01:50 +00007881 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007882 Py_UNICODE_FILL(p, str->str[0], len);
7883 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007884 Py_ssize_t done = str->length; /* number of characters copied this far */
7885 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007887 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007888 Py_UNICODE_COPY(p+done, p, n);
7889 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 }
7892
7893 return (PyObject*) u;
7894}
7895
7896PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 PyObject *subobj,
7898 PyObject *replobj,
7899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
7901 PyObject *self;
7902 PyObject *str1;
7903 PyObject *str2;
7904 PyObject *result;
7905
7906 self = PyUnicode_FromObject(obj);
7907 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 str1 = PyUnicode_FromObject(subobj);
7910 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 Py_DECREF(self);
7912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 }
7914 str2 = PyUnicode_FromObject(replobj);
7915 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_DECREF(self);
7917 Py_DECREF(str1);
7918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 }
Tim Petersced69f82003-09-16 20:30:58 +00007920 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 (PyUnicodeObject *)str1,
7922 (PyUnicodeObject *)str2,
7923 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 Py_DECREF(self);
7925 Py_DECREF(str1);
7926 Py_DECREF(str2);
7927 return result;
7928}
7929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007930PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932\n\
7933Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007934old replaced by new. If the optional argument count is\n\
7935given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936
7937static PyObject*
7938unicode_replace(PyUnicodeObject *self, PyObject *args)
7939{
7940 PyUnicodeObject *str1;
7941 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007942 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 PyObject *result;
7944
Martin v. Löwis18e16552006-02-15 17:27:45 +00007945 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 return NULL;
7947 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7948 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007951 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 Py_DECREF(str1);
7953 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955
7956 result = replace(self, str1, str2, maxcount);
7957
7958 Py_DECREF(str1);
7959 Py_DECREF(str2);
7960 return result;
7961}
7962
7963static
7964PyObject *unicode_repr(PyObject *unicode)
7965{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007966 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007967 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007968 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7969 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7970
7971 /* XXX(nnorwitz): rather than over-allocating, it would be
7972 better to choose a different scheme. Perhaps scan the
7973 first N-chars of the string and allocate based on that size.
7974 */
7975 /* Initial allocation is based on the longest-possible unichr
7976 escape.
7977
7978 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7979 unichr, so in this case it's the longest unichr escape. In
7980 narrow (UTF-16) builds this is five chars per source unichr
7981 since there are two unichrs in the surrogate pair, so in narrow
7982 (UTF-16) builds it's not the longest unichr escape.
7983
7984 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7985 so in the narrow (UTF-16) build case it's the longest unichr
7986 escape.
7987 */
7988
Walter Dörwald1ab83302007-05-18 17:15:44 +00007989 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007991#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007993#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007995#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007997 if (repr == NULL)
7998 return NULL;
7999
Walter Dörwald1ab83302007-05-18 17:15:44 +00008000 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008001
8002 /* Add quote */
8003 *p++ = (findchar(s, size, '\'') &&
8004 !findchar(s, size, '"')) ? '"' : '\'';
8005 while (size-- > 0) {
8006 Py_UNICODE ch = *s++;
8007
8008 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008009 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008010 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008011 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008012 continue;
8013 }
8014
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008016 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008017 *p++ = '\\';
8018 *p++ = 't';
8019 }
8020 else if (ch == '\n') {
8021 *p++ = '\\';
8022 *p++ = 'n';
8023 }
8024 else if (ch == '\r') {
8025 *p++ = '\\';
8026 *p++ = 'r';
8027 }
8028
8029 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008030 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008031 *p++ = '\\';
8032 *p++ = 'x';
8033 *p++ = hexdigits[(ch >> 4) & 0x000F];
8034 *p++ = hexdigits[ch & 0x000F];
8035 }
8036
Georg Brandl559e5d72008-06-11 18:37:52 +00008037 /* Copy ASCII characters as-is */
8038 else if (ch < 0x7F) {
8039 *p++ = ch;
8040 }
8041
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008043 else {
8044 Py_UCS4 ucs = ch;
8045
8046#ifndef Py_UNICODE_WIDE
8047 Py_UNICODE ch2 = 0;
8048 /* Get code point from surrogate pair */
8049 if (size > 0) {
8050 ch2 = *s;
8051 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008053 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008056 size--;
8057 }
8058 }
8059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008061 (categories Z* and C* except ASCII space)
8062 */
8063 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8064 /* Map 8-bit characters to '\xhh' */
8065 if (ucs <= 0xff) {
8066 *p++ = '\\';
8067 *p++ = 'x';
8068 *p++ = hexdigits[(ch >> 4) & 0x000F];
8069 *p++ = hexdigits[ch & 0x000F];
8070 }
8071 /* Map 21-bit characters to '\U00xxxxxx' */
8072 else if (ucs >= 0x10000) {
8073 *p++ = '\\';
8074 *p++ = 'U';
8075 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8076 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8077 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8078 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8079 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8080 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8081 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8082 *p++ = hexdigits[ucs & 0x0000000F];
8083 }
8084 /* Map 16-bit characters to '\uxxxx' */
8085 else {
8086 *p++ = '\\';
8087 *p++ = 'u';
8088 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8089 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8090 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8091 *p++ = hexdigits[ucs & 0x000F];
8092 }
8093 }
8094 /* Copy characters as-is */
8095 else {
8096 *p++ = ch;
8097#ifndef Py_UNICODE_WIDE
8098 if (ucs >= 0x10000)
8099 *p++ = ch2;
8100#endif
8101 }
8102 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008103 }
8104 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008105 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008106
8107 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008108 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008109 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110}
8111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008112PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114\n\
8115Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008116such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117arguments start and end are interpreted as in slice notation.\n\
8118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008119Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
8121static PyObject *
8122unicode_rfind(PyUnicodeObject *self, PyObject *args)
8123{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008124 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008125 Py_ssize_t start;
8126 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008127 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Christian Heimes9cd17752007-11-18 19:35:23 +00008129 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131
Thomas Wouters477c8d52006-05-27 19:21:47 +00008132 result = stringlib_rfind_slice(
8133 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8134 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8135 start, end
8136 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
8138 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008139
Christian Heimes217cfd12007-12-02 14:31:20 +00008140 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141}
8142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008143PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008146Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147
8148static PyObject *
8149unicode_rindex(PyUnicodeObject *self, PyObject *args)
8150{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008151 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008152 Py_ssize_t start;
8153 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008154 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155
Christian Heimes9cd17752007-11-18 19:35:23 +00008156 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158
Thomas Wouters477c8d52006-05-27 19:21:47 +00008159 result = stringlib_rfind_slice(
8160 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8161 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8162 start, end
8163 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
8165 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008166
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 if (result < 0) {
8168 PyErr_SetString(PyExc_ValueError, "substring not found");
8169 return NULL;
8170 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008171 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172}
8173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008177Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008178done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179
8180static PyObject *
8181unicode_rjust(PyUnicodeObject *self, PyObject *args)
8182{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008183 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008184 Py_UNICODE fillchar = ' ';
8185
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008186 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 return NULL;
8188
Tim Peters7a29bd52001-09-12 03:03:31 +00008189 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 Py_INCREF(self);
8191 return (PyObject*) self;
8192 }
8193
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008194 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195}
8196
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 PyObject *sep,
8199 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200{
8201 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008202
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 s = PyUnicode_FromObject(s);
8204 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 if (sep != NULL) {
8207 sep = PyUnicode_FromObject(sep);
8208 if (sep == NULL) {
8209 Py_DECREF(s);
8210 return NULL;
8211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 }
8213
8214 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8215
8216 Py_DECREF(s);
8217 Py_XDECREF(sep);
8218 return result;
8219}
8220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008221PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223\n\
8224Return a list of the words in S, using sep as the\n\
8225delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008226splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008227whitespace string is a separator and empty strings are\n\
8228removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229
8230static PyObject*
8231unicode_split(PyUnicodeObject *self, PyObject *args)
8232{
8233 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008234 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235
Martin v. Löwis18e16552006-02-15 17:27:45 +00008236 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 return NULL;
8238
8239 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245}
8246
Thomas Wouters477c8d52006-05-27 19:21:47 +00008247PyObject *
8248PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8249{
8250 PyObject* str_obj;
8251 PyObject* sep_obj;
8252 PyObject* out;
8253
8254 str_obj = PyUnicode_FromObject(str_in);
8255 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008257 sep_obj = PyUnicode_FromObject(sep_in);
8258 if (!sep_obj) {
8259 Py_DECREF(str_obj);
8260 return NULL;
8261 }
8262
8263 out = stringlib_partition(
8264 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8265 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8266 );
8267
8268 Py_DECREF(sep_obj);
8269 Py_DECREF(str_obj);
8270
8271 return out;
8272}
8273
8274
8275PyObject *
8276PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8277{
8278 PyObject* str_obj;
8279 PyObject* sep_obj;
8280 PyObject* out;
8281
8282 str_obj = PyUnicode_FromObject(str_in);
8283 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008285 sep_obj = PyUnicode_FromObject(sep_in);
8286 if (!sep_obj) {
8287 Py_DECREF(str_obj);
8288 return NULL;
8289 }
8290
8291 out = stringlib_rpartition(
8292 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8293 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8294 );
8295
8296 Py_DECREF(sep_obj);
8297 Py_DECREF(str_obj);
8298
8299 return out;
8300}
8301
8302PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008304\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008305Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008306the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008307found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008308
8309static PyObject*
8310unicode_partition(PyUnicodeObject *self, PyObject *separator)
8311{
8312 return PyUnicode_Partition((PyObject *)self, separator);
8313}
8314
8315PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008317\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008318Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008320separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321
8322static PyObject*
8323unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8324{
8325 return PyUnicode_RPartition((PyObject *)self, separator);
8326}
8327
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008328PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 PyObject *sep,
8330 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008331{
8332 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008334 s = PyUnicode_FromObject(s);
8335 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008336 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (sep != NULL) {
8338 sep = PyUnicode_FromObject(sep);
8339 if (sep == NULL) {
8340 Py_DECREF(s);
8341 return NULL;
8342 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008343 }
8344
8345 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8346
8347 Py_DECREF(s);
8348 Py_XDECREF(sep);
8349 return result;
8350}
8351
8352PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008354\n\
8355Return a list of the words in S, using sep as the\n\
8356delimiter string, starting at the end of the string and\n\
8357working to the front. If maxsplit is given, at most maxsplit\n\
8358splits are done. If sep is not specified, any whitespace string\n\
8359is a separator.");
8360
8361static PyObject*
8362unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8363{
8364 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008365 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008366
Martin v. Löwis18e16552006-02-15 17:27:45 +00008367 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008368 return NULL;
8369
8370 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008372 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008374 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008376}
8377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008378PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380\n\
8381Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008382Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008383is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384
8385static PyObject*
8386unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8387{
Guido van Rossum86662912000-04-11 15:38:46 +00008388 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389
Guido van Rossum86662912000-04-11 15:38:46 +00008390 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 return NULL;
8392
Guido van Rossum86662912000-04-11 15:38:46 +00008393 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394}
8395
8396static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008397PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398{
Walter Dörwald346737f2007-05-31 10:44:43 +00008399 if (PyUnicode_CheckExact(self)) {
8400 Py_INCREF(self);
8401 return self;
8402 } else
8403 /* Subtype -- return genuine unicode string with the same value. */
8404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8405 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406}
8407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008408PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410\n\
8411Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008412and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413
8414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008415unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 return fixup(self, fixswapcase);
8418}
8419
Georg Brandlceee0772007-11-27 23:48:05 +00008420PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008422\n\
8423Return a translation table usable for str.translate().\n\
8424If there is only one argument, it must be a dictionary mapping Unicode\n\
8425ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008426Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008427If there are two arguments, they must be strings of equal length, and\n\
8428in the resulting dictionary, each character in x will be mapped to the\n\
8429character at the same position in y. If there is a third argument, it\n\
8430must be a string, whose characters will be mapped to None in the result.");
8431
8432static PyObject*
8433unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8434{
8435 PyObject *x, *y = NULL, *z = NULL;
8436 PyObject *new = NULL, *key, *value;
8437 Py_ssize_t i = 0;
8438 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439
Georg Brandlceee0772007-11-27 23:48:05 +00008440 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8441 return NULL;
8442 new = PyDict_New();
8443 if (!new)
8444 return NULL;
8445 if (y != NULL) {
8446 /* x must be a string too, of equal length */
8447 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8448 if (!PyUnicode_Check(x)) {
8449 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8450 "be a string if there is a second argument");
8451 goto err;
8452 }
8453 if (PyUnicode_GET_SIZE(x) != ylen) {
8454 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8455 "arguments must have equal length");
8456 goto err;
8457 }
8458 /* create entries for translating chars in x to those in y */
8459 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008460 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8461 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008462 if (!key || !value)
8463 goto err;
8464 res = PyDict_SetItem(new, key, value);
8465 Py_DECREF(key);
8466 Py_DECREF(value);
8467 if (res < 0)
8468 goto err;
8469 }
8470 /* create entries for deleting chars in z */
8471 if (z != NULL) {
8472 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008473 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008474 if (!key)
8475 goto err;
8476 res = PyDict_SetItem(new, key, Py_None);
8477 Py_DECREF(key);
8478 if (res < 0)
8479 goto err;
8480 }
8481 }
8482 } else {
8483 /* x must be a dict */
8484 if (!PyDict_Check(x)) {
8485 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8486 "to maketrans it must be a dict");
8487 goto err;
8488 }
8489 /* copy entries into the new dict, converting string keys to int keys */
8490 while (PyDict_Next(x, &i, &key, &value)) {
8491 if (PyUnicode_Check(key)) {
8492 /* convert string keys to integer keys */
8493 PyObject *newkey;
8494 if (PyUnicode_GET_SIZE(key) != 1) {
8495 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8496 "table must be of length 1");
8497 goto err;
8498 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008499 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008500 if (!newkey)
8501 goto err;
8502 res = PyDict_SetItem(new, newkey, value);
8503 Py_DECREF(newkey);
8504 if (res < 0)
8505 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008506 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008507 /* just keep integer keys */
8508 if (PyDict_SetItem(new, key, value) < 0)
8509 goto err;
8510 } else {
8511 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8512 "be strings or integers");
8513 goto err;
8514 }
8515 }
8516 }
8517 return new;
8518 err:
8519 Py_DECREF(new);
8520 return NULL;
8521}
8522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008523PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525\n\
8526Return a copy of the string S, where all characters have been mapped\n\
8527through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008528Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008529Unmapped characters are left untouched. Characters mapped to None\n\
8530are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531
8532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008533unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534{
Georg Brandlceee0772007-11-27 23:48:05 +00008535 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536}
8537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008538PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008541Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
8543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008544unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 return fixup(self, fixupper);
8547}
8548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008549PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008552Pad a numeric string S with zeros on the left, to fill a field\n\
8553of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
8555static PyObject *
8556unicode_zfill(PyUnicodeObject *self, PyObject *args)
8557{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008558 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 PyUnicodeObject *u;
8560
Martin v. Löwis18e16552006-02-15 17:27:45 +00008561 Py_ssize_t width;
8562 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564
8565 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008566 if (PyUnicode_CheckExact(self)) {
8567 Py_INCREF(self);
8568 return (PyObject*) self;
8569 }
8570 else
8571 return PyUnicode_FromUnicode(
8572 PyUnicode_AS_UNICODE(self),
8573 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 }
8576
8577 fill = width - self->length;
8578
8579 u = pad(self, fill, 0, '0');
8580
Walter Dörwald068325e2002-04-15 13:36:47 +00008581 if (u == NULL)
8582 return NULL;
8583
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 if (u->str[fill] == '+' || u->str[fill] == '-') {
8585 /* move sign to beginning of string */
8586 u->str[0] = u->str[fill];
8587 u->str[fill] = '0';
8588 }
8589
8590 return (PyObject*) u;
8591}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008601PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008604Return True if S starts with the specified prefix, False otherwise.\n\
8605With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008606With optional end, stop comparing S at that position.\n\
8607prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
8609static PyObject *
8610unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008613 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008615 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008616 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008617 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008619 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8621 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008622 if (PyTuple_Check(subobj)) {
8623 Py_ssize_t i;
8624 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8625 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008627 if (substring == NULL)
8628 return NULL;
8629 result = tailmatch(self, substring, start, end, -1);
8630 Py_DECREF(substring);
8631 if (result) {
8632 Py_RETURN_TRUE;
8633 }
8634 }
8635 /* nothing matched */
8636 Py_RETURN_FALSE;
8637 }
8638 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008641 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008643 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644}
8645
8646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008647PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008650Return True if S ends with the specified suffix, False otherwise.\n\
8651With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008652With optional end, stop comparing S at that position.\n\
8653suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
8655static PyObject *
8656unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008659 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008661 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008662 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008663 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008665 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8667 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008668 if (PyTuple_Check(subobj)) {
8669 Py_ssize_t i;
8670 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8671 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008673 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008675 result = tailmatch(self, substring, start, end, +1);
8676 Py_DECREF(substring);
8677 if (result) {
8678 Py_RETURN_TRUE;
8679 }
8680 }
8681 Py_RETURN_FALSE;
8682 }
8683 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008687 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008689 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690}
8691
Eric Smith8c663262007-08-25 02:26:07 +00008692#include "stringlib/string_format.h"
8693
8694PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008696\n\
8697");
8698
Eric Smith4a7d76d2008-05-30 18:10:19 +00008699static PyObject *
8700unicode__format__(PyObject* self, PyObject* args)
8701{
8702 PyObject *format_spec;
8703
8704 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8705 return NULL;
8706
8707 return _PyUnicode_FormatAdvanced(self,
8708 PyUnicode_AS_UNICODE(format_spec),
8709 PyUnicode_GET_SIZE(format_spec));
8710}
8711
Eric Smith8c663262007-08-25 02:26:07 +00008712PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008714\n\
8715");
8716
8717static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008718unicode__sizeof__(PyUnicodeObject *v)
8719{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008720 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8721 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008722}
8723
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008726
8727static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008728unicode_getnewargs(PyUnicodeObject *v)
8729{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008731}
8732
8733
/* Method table for the str type.

   Each entry gives the Python-visible name, the C implementation, the
   calling-convention flags and (where one exists) the docstring.  The
   table ends with a {NULL, NULL} sentinel entry.  The two "#if 0"
   sections keep entries that are currently compiled out. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* The only entry that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* Registered with METH_STATIC, unlike every other entry. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
8798
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008799static PyObject *
8800unicode_mod(PyObject *v, PyObject *w)
8801{
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 if (!PyUnicode_Check(v)) {
8803 Py_INCREF(Py_NotImplemented);
8804 return Py_NotImplemented;
8805 }
8806 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008807}
8808
/* Number-protocol slots for str.  Only nb_remainder is filled in (with
   unicode_mod); the three slots before it are explicitly zero and the
   slots after it are left out of the initializer. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
8815
/* Sequence-protocol slots for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are zero. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8826
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008827static PyObject*
8828unicode_subscript(PyUnicodeObject* self, PyObject* item)
8829{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008830 if (PyIndex_Check(item)) {
8831 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008832 if (i == -1 && PyErr_Occurred())
8833 return NULL;
8834 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008835 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008836 return unicode_getitem(self, i);
8837 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008838 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008839 Py_UNICODE* source_buf;
8840 Py_UNICODE* result_buf;
8841 PyObject* result;
8842
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008843 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008845 return NULL;
8846 }
8847
8848 if (slicelength <= 0) {
8849 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008850 } else if (start == 0 && step == 1 && slicelength == self->length &&
8851 PyUnicode_CheckExact(self)) {
8852 Py_INCREF(self);
8853 return (PyObject *)self;
8854 } else if (step == 1) {
8855 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008856 } else {
8857 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008858 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8859 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 if (result_buf == NULL)
8862 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008863
8864 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8865 result_buf[i] = source_buf[cur];
8866 }
Tim Petersced69f82003-09-16 20:30:58 +00008867
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008868 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008869 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008870 return result;
8871 }
8872 } else {
8873 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8874 return NULL;
8875 }
8876}
8877
/* Mapping-protocol slots for str: length and subscripting are provided,
   item assignment is not (mp_ass_subscript is zero). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8883
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885/* Helpers for PyUnicode_Format() */
8886
8887static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008888getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008890 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 (*p_argidx)++;
8893 if (arglen < 0)
8894 return args;
8895 else
8896 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 }
8898 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 return NULL;
8901}
8902
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008903/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008905static PyObject *
8906formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008908 char *p;
8909 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008911
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 x = PyFloat_AsDouble(v);
8913 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008914 return NULL;
8915
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008918
Eric Smith0923d1d2009-04-16 20:16:10 +00008919 p = PyOS_double_to_string(x, type, prec,
8920 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008921 if (p == NULL)
8922 return NULL;
8923 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008924 PyMem_Free(p);
8925 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926}
8927
Tim Peters38fd5b62000-09-21 05:43:11 +00008928static PyObject*
8929formatlong(PyObject *val, int flags, int prec, int type)
8930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008931 char *buf;
8932 int len;
8933 PyObject *str; /* temporary string object. */
8934 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008935
Benjamin Peterson14339b62009-01-31 16:36:08 +00008936 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8937 if (!str)
8938 return NULL;
8939 result = PyUnicode_FromStringAndSize(buf, len);
8940 Py_DECREF(str);
8941 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008942}
8943
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944static int
8945formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008946 size_t buflen,
8947 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008949 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008950 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 if (PyUnicode_GET_SIZE(v) == 1) {
8952 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8953 buf[1] = '\0';
8954 return 1;
8955 }
8956#ifndef Py_UNICODE_WIDE
8957 if (PyUnicode_GET_SIZE(v) == 2) {
8958 /* Decode a valid surrogate pair */
8959 int c0 = PyUnicode_AS_UNICODE(v)[0];
8960 int c1 = PyUnicode_AS_UNICODE(v)[1];
8961 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8962 0xDC00 <= c1 && c1 <= 0xDFFF) {
8963 buf[0] = c0;
8964 buf[1] = c1;
8965 buf[2] = '\0';
8966 return 2;
8967 }
8968 }
8969#endif
8970 goto onError;
8971 }
8972 else {
8973 /* Integer input truncated to a character */
8974 long x;
8975 x = PyLong_AsLong(v);
8976 if (x == -1 && PyErr_Occurred())
8977 goto onError;
8978
8979 if (x < 0 || x > 0x10ffff) {
8980 PyErr_SetString(PyExc_OverflowError,
8981 "%c arg not in range(0x110000)");
8982 return -1;
8983 }
8984
8985#ifndef Py_UNICODE_WIDE
8986 if (x > 0xffff) {
8987 x -= 0x10000;
8988 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8989 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8990 return 2;
8991 }
8992#endif
8993 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008994 buf[1] = '\0';
8995 return 1;
8996 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008999 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009001 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002}
9003
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length, in Py_UNICODE units, of the per-specifier
   scratch buffer declared in PyUnicode_Format(); it is handed to
   formatchar() for '%c' and also holds the single character produced
   for a literal '%%'.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009008
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011{
9012 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009013 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 int args_owned = 0;
9015 PyUnicodeObject *result = NULL;
9016 PyObject *dict = NULL;
9017 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009018
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 PyErr_BadInternalCall();
9021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 }
9023 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009024 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 fmt = PyUnicode_AS_UNICODE(uformat);
9027 fmtcnt = PyUnicode_GET_SIZE(uformat);
9028
9029 reslen = rescnt = fmtcnt + 100;
9030 result = _PyUnicode_New(reslen);
9031 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 res = PyUnicode_AS_UNICODE(result);
9034
9035 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 arglen = PyTuple_Size(args);
9037 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 }
9039 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 arglen = -1;
9041 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009043 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009044 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046
9047 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 if (*fmt != '%') {
9049 if (--rescnt < 0) {
9050 rescnt = fmtcnt + 100;
9051 reslen += rescnt;
9052 if (_PyUnicode_Resize(&result, reslen) < 0)
9053 goto onError;
9054 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9055 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 }
9059 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 /* Got a format specifier */
9061 int flags = 0;
9062 Py_ssize_t width = -1;
9063 int prec = -1;
9064 Py_UNICODE c = '\0';
9065 Py_UNICODE fill;
9066 int isnumok;
9067 PyObject *v = NULL;
9068 PyObject *temp = NULL;
9069 Py_UNICODE *pbuf;
9070 Py_UNICODE sign;
9071 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009072 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 fmt++;
9075 if (*fmt == '(') {
9076 Py_UNICODE *keystart;
9077 Py_ssize_t keylen;
9078 PyObject *key;
9079 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009080
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 if (dict == NULL) {
9082 PyErr_SetString(PyExc_TypeError,
9083 "format requires a mapping");
9084 goto onError;
9085 }
9086 ++fmt;
9087 --fmtcnt;
9088 keystart = fmt;
9089 /* Skip over balanced parentheses */
9090 while (pcount > 0 && --fmtcnt >= 0) {
9091 if (*fmt == ')')
9092 --pcount;
9093 else if (*fmt == '(')
9094 ++pcount;
9095 fmt++;
9096 }
9097 keylen = fmt - keystart - 1;
9098 if (fmtcnt < 0 || pcount > 0) {
9099 PyErr_SetString(PyExc_ValueError,
9100 "incomplete format key");
9101 goto onError;
9102 }
9103#if 0
9104 /* keys are converted to strings using UTF-8 and
9105 then looked up since Python uses strings to hold
9106 variables names etc. in its namespaces and we
9107 wouldn't want to break common idioms. */
9108 key = PyUnicode_EncodeUTF8(keystart,
9109 keylen,
9110 NULL);
9111#else
9112 key = PyUnicode_FromUnicode(keystart, keylen);
9113#endif
9114 if (key == NULL)
9115 goto onError;
9116 if (args_owned) {
9117 Py_DECREF(args);
9118 args_owned = 0;
9119 }
9120 args = PyObject_GetItem(dict, key);
9121 Py_DECREF(key);
9122 if (args == NULL) {
9123 goto onError;
9124 }
9125 args_owned = 1;
9126 arglen = -1;
9127 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 while (--fmtcnt >= 0) {
9130 switch (c = *fmt++) {
9131 case '-': flags |= F_LJUST; continue;
9132 case '+': flags |= F_SIGN; continue;
9133 case ' ': flags |= F_BLANK; continue;
9134 case '#': flags |= F_ALT; continue;
9135 case '0': flags |= F_ZERO; continue;
9136 }
9137 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 if (c == '*') {
9140 v = getnextarg(args, arglen, &argidx);
9141 if (v == NULL)
9142 goto onError;
9143 if (!PyLong_Check(v)) {
9144 PyErr_SetString(PyExc_TypeError,
9145 "* wants int");
9146 goto onError;
9147 }
9148 width = PyLong_AsLong(v);
9149 if (width == -1 && PyErr_Occurred())
9150 goto onError;
9151 if (width < 0) {
9152 flags |= F_LJUST;
9153 width = -width;
9154 }
9155 if (--fmtcnt >= 0)
9156 c = *fmt++;
9157 }
9158 else if (c >= '0' && c <= '9') {
9159 width = c - '0';
9160 while (--fmtcnt >= 0) {
9161 c = *fmt++;
9162 if (c < '0' || c > '9')
9163 break;
9164 if ((width*10) / 10 != width) {
9165 PyErr_SetString(PyExc_ValueError,
9166 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009167 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 }
9169 width = width*10 + (c - '0');
9170 }
9171 }
9172 if (c == '.') {
9173 prec = 0;
9174 if (--fmtcnt >= 0)
9175 c = *fmt++;
9176 if (c == '*') {
9177 v = getnextarg(args, arglen, &argidx);
9178 if (v == NULL)
9179 goto onError;
9180 if (!PyLong_Check(v)) {
9181 PyErr_SetString(PyExc_TypeError,
9182 "* wants int");
9183 goto onError;
9184 }
9185 prec = PyLong_AsLong(v);
9186 if (prec == -1 && PyErr_Occurred())
9187 goto onError;
9188 if (prec < 0)
9189 prec = 0;
9190 if (--fmtcnt >= 0)
9191 c = *fmt++;
9192 }
9193 else if (c >= '0' && c <= '9') {
9194 prec = c - '0';
9195 while (--fmtcnt >= 0) {
9196 c = Py_CHARMASK(*fmt++);
9197 if (c < '0' || c > '9')
9198 break;
9199 if ((prec*10) / 10 != prec) {
9200 PyErr_SetString(PyExc_ValueError,
9201 "prec too big");
9202 goto onError;
9203 }
9204 prec = prec*10 + (c - '0');
9205 }
9206 }
9207 } /* prec */
9208 if (fmtcnt >= 0) {
9209 if (c == 'h' || c == 'l' || c == 'L') {
9210 if (--fmtcnt >= 0)
9211 c = *fmt++;
9212 }
9213 }
9214 if (fmtcnt < 0) {
9215 PyErr_SetString(PyExc_ValueError,
9216 "incomplete format");
9217 goto onError;
9218 }
9219 if (c != '%') {
9220 v = getnextarg(args, arglen, &argidx);
9221 if (v == NULL)
9222 goto onError;
9223 }
9224 sign = 0;
9225 fill = ' ';
9226 switch (c) {
9227
9228 case '%':
9229 pbuf = formatbuf;
9230 /* presume that buffer length is at least 1 */
9231 pbuf[0] = '%';
9232 len = 1;
9233 break;
9234
9235 case 's':
9236 case 'r':
9237 case 'a':
9238 if (PyUnicode_Check(v) && c == 's') {
9239 temp = v;
9240 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009241 }
9242 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 if (c == 's')
9244 temp = PyObject_Str(v);
9245 else if (c == 'r')
9246 temp = PyObject_Repr(v);
9247 else
9248 temp = PyObject_ASCII(v);
9249 if (temp == NULL)
9250 goto onError;
9251 if (PyUnicode_Check(temp))
9252 /* nothing to do */;
9253 else {
9254 Py_DECREF(temp);
9255 PyErr_SetString(PyExc_TypeError,
9256 "%s argument has non-string str()");
9257 goto onError;
9258 }
9259 }
9260 pbuf = PyUnicode_AS_UNICODE(temp);
9261 len = PyUnicode_GET_SIZE(temp);
9262 if (prec >= 0 && len > prec)
9263 len = prec;
9264 break;
9265
9266 case 'i':
9267 case 'd':
9268 case 'u':
9269 case 'o':
9270 case 'x':
9271 case 'X':
9272 if (c == 'i')
9273 c = 'd';
9274 isnumok = 0;
9275 if (PyNumber_Check(v)) {
9276 PyObject *iobj=NULL;
9277
9278 if (PyLong_Check(v)) {
9279 iobj = v;
9280 Py_INCREF(iobj);
9281 }
9282 else {
9283 iobj = PyNumber_Long(v);
9284 }
9285 if (iobj!=NULL) {
9286 if (PyLong_Check(iobj)) {
9287 isnumok = 1;
9288 temp = formatlong(iobj, flags, prec, c);
9289 Py_DECREF(iobj);
9290 if (!temp)
9291 goto onError;
9292 pbuf = PyUnicode_AS_UNICODE(temp);
9293 len = PyUnicode_GET_SIZE(temp);
9294 sign = 1;
9295 }
9296 else {
9297 Py_DECREF(iobj);
9298 }
9299 }
9300 }
9301 if (!isnumok) {
9302 PyErr_Format(PyExc_TypeError,
9303 "%%%c format: a number is required, "
9304 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9305 goto onError;
9306 }
9307 if (flags & F_ZERO)
9308 fill = '0';
9309 break;
9310
9311 case 'e':
9312 case 'E':
9313 case 'f':
9314 case 'F':
9315 case 'g':
9316 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009317 temp = formatfloat(v, flags, prec, c);
9318 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009320 pbuf = PyUnicode_AS_UNICODE(temp);
9321 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 sign = 1;
9323 if (flags & F_ZERO)
9324 fill = '0';
9325 break;
9326
9327 case 'c':
9328 pbuf = formatbuf;
9329 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9330 if (len < 0)
9331 goto onError;
9332 break;
9333
9334 default:
9335 PyErr_Format(PyExc_ValueError,
9336 "unsupported format character '%c' (0x%x) "
9337 "at index %zd",
9338 (31<=c && c<=126) ? (char)c : '?',
9339 (int)c,
9340 (Py_ssize_t)(fmt - 1 -
9341 PyUnicode_AS_UNICODE(uformat)));
9342 goto onError;
9343 }
9344 if (sign) {
9345 if (*pbuf == '-' || *pbuf == '+') {
9346 sign = *pbuf++;
9347 len--;
9348 }
9349 else if (flags & F_SIGN)
9350 sign = '+';
9351 else if (flags & F_BLANK)
9352 sign = ' ';
9353 else
9354 sign = 0;
9355 }
9356 if (width < len)
9357 width = len;
9358 if (rescnt - (sign != 0) < width) {
9359 reslen -= rescnt;
9360 rescnt = width + fmtcnt + 100;
9361 reslen += rescnt;
9362 if (reslen < 0) {
9363 Py_XDECREF(temp);
9364 PyErr_NoMemory();
9365 goto onError;
9366 }
9367 if (_PyUnicode_Resize(&result, reslen) < 0) {
9368 Py_XDECREF(temp);
9369 goto onError;
9370 }
9371 res = PyUnicode_AS_UNICODE(result)
9372 + reslen - rescnt;
9373 }
9374 if (sign) {
9375 if (fill != ' ')
9376 *res++ = sign;
9377 rescnt--;
9378 if (width > len)
9379 width--;
9380 }
9381 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9382 assert(pbuf[0] == '0');
9383 assert(pbuf[1] == c);
9384 if (fill != ' ') {
9385 *res++ = *pbuf++;
9386 *res++ = *pbuf++;
9387 }
9388 rescnt -= 2;
9389 width -= 2;
9390 if (width < 0)
9391 width = 0;
9392 len -= 2;
9393 }
9394 if (width > len && !(flags & F_LJUST)) {
9395 do {
9396 --rescnt;
9397 *res++ = fill;
9398 } while (--width > len);
9399 }
9400 if (fill == ' ') {
9401 if (sign)
9402 *res++ = sign;
9403 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9404 assert(pbuf[0] == '0');
9405 assert(pbuf[1] == c);
9406 *res++ = *pbuf++;
9407 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009408 }
9409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 Py_UNICODE_COPY(res, pbuf, len);
9411 res += len;
9412 rescnt -= len;
9413 while (--width >= len) {
9414 --rescnt;
9415 *res++ = ' ';
9416 }
9417 if (dict && (argidx < arglen) && c != '%') {
9418 PyErr_SetString(PyExc_TypeError,
9419 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009420 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 goto onError;
9422 }
9423 Py_XDECREF(temp);
9424 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425 } /* until end */
9426 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 PyErr_SetString(PyExc_TypeError,
9428 "not all arguments converted during string formatting");
9429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430 }
9431
Thomas Woutersa96affe2006-03-12 00:29:36 +00009432 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436 }
9437 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 return (PyObject *)result;
9439
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441 Py_XDECREF(result);
9442 Py_DECREF(uformat);
9443 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445 }
9446 return NULL;
9447}
9448
Jeremy Hylton938ace62002-07-17 16:30:39 +00009449static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009450unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9451
Tim Peters6d6c1a32001-08-02 04:15:00 +00009452static PyObject *
9453unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9454{
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009456 static char *kwlist[] = {"object", "encoding", "errors", 0};
9457 char *encoding = NULL;
9458 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009459
Benjamin Peterson14339b62009-01-31 16:36:08 +00009460 if (type != &PyUnicode_Type)
9461 return unicode_subtype_new(type, args, kwds);
9462 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009464 return NULL;
9465 if (x == NULL)
9466 return (PyObject *)_PyUnicode_New(0);
9467 if (encoding == NULL && errors == NULL)
9468 return PyObject_Str(x);
9469 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009471}
9472
Guido van Rossume023fe02001-08-30 03:12:59 +00009473static PyObject *
9474unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9475{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009476 PyUnicodeObject *tmp, *pnew;
9477 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009478
Benjamin Peterson14339b62009-01-31 16:36:08 +00009479 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9480 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9481 if (tmp == NULL)
9482 return NULL;
9483 assert(PyUnicode_Check(tmp));
9484 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9485 if (pnew == NULL) {
9486 Py_DECREF(tmp);
9487 return NULL;
9488 }
9489 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9490 if (pnew->str == NULL) {
9491 _Py_ForgetReference((PyObject *)pnew);
9492 PyObject_Del(pnew);
9493 Py_DECREF(tmp);
9494 return PyErr_NoMemory();
9495 }
9496 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9497 pnew->length = n;
9498 pnew->hash = tmp->hash;
9499 Py_DECREF(tmp);
9500 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009501}
9502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009503PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009505\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009506Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009507encoding defaults to the current default string encoding.\n\
9508errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009509
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009510static PyObject *unicode_iter(PyObject *seq);
9511
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009513 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009514 "str", /* tp_name */
9515 sizeof(PyUnicodeObject), /* tp_size */
9516 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009518 (destructor)unicode_dealloc, /* tp_dealloc */
9519 0, /* tp_print */
9520 0, /* tp_getattr */
9521 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009522 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009523 unicode_repr, /* tp_repr */
9524 &unicode_as_number, /* tp_as_number */
9525 &unicode_as_sequence, /* tp_as_sequence */
9526 &unicode_as_mapping, /* tp_as_mapping */
9527 (hashfunc) unicode_hash, /* tp_hash*/
9528 0, /* tp_call*/
9529 (reprfunc) unicode_str, /* tp_str */
9530 PyObject_GenericGetAttr, /* tp_getattro */
9531 0, /* tp_setattro */
9532 0, /* tp_as_buffer */
9533 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009535 unicode_doc, /* tp_doc */
9536 0, /* tp_traverse */
9537 0, /* tp_clear */
9538 PyUnicode_RichCompare, /* tp_richcompare */
9539 0, /* tp_weaklistoffset */
9540 unicode_iter, /* tp_iter */
9541 0, /* tp_iternext */
9542 unicode_methods, /* tp_methods */
9543 0, /* tp_members */
9544 0, /* tp_getset */
9545 &PyBaseObject_Type, /* tp_base */
9546 0, /* tp_dict */
9547 0, /* tp_descr_get */
9548 0, /* tp_descr_set */
9549 0, /* tp_dictoffset */
9550 0, /* tp_init */
9551 0, /* tp_alloc */
9552 unicode_new, /* tp_new */
9553 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554};
9555
9556/* Initialize the Unicode implementation */
9557
Thomas Wouters78890102000-07-22 19:25:51 +00009558void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009560 int i;
9561
Thomas Wouters477c8d52006-05-27 19:21:47 +00009562 /* XXX - move this array to unicodectype.c ? */
9563 Py_UNICODE linebreak[] = {
9564 0x000A, /* LINE FEED */
9565 0x000D, /* CARRIAGE RETURN */
9566 0x001C, /* FILE SEPARATOR */
9567 0x001D, /* GROUP SEPARATOR */
9568 0x001E, /* RECORD SEPARATOR */
9569 0x0085, /* NEXT LINE */
9570 0x2028, /* LINE SEPARATOR */
9571 0x2029, /* PARAGRAPH SEPARATOR */
9572 };
9573
Fred Drakee4315f52000-05-09 19:53:39 +00009574 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009575 free_list = NULL;
9576 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009578 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009580
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009581 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009583 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009585
9586 /* initialize the linebreak bloom filter */
9587 bloom_linebreak = make_bloom_mask(
9588 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9589 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009590
9591 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592}
9593
9594/* Finalize the Unicode implementation */
9595
Christian Heimesa156e092008-02-16 07:38:31 +00009596int
9597PyUnicode_ClearFreeList(void)
9598{
9599 int freelist_size = numfree;
9600 PyUnicodeObject *u;
9601
9602 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 PyUnicodeObject *v = u;
9604 u = *(PyUnicodeObject **)u;
9605 if (v->str)
9606 PyObject_DEL(v->str);
9607 Py_XDECREF(v->defenc);
9608 PyObject_Del(v);
9609 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009610 }
9611 free_list = NULL;
9612 assert(numfree == 0);
9613 return freelist_size;
9614}
9615
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616void
Thomas Wouters78890102000-07-22 19:25:51 +00009617_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009619 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009621 Py_XDECREF(unicode_empty);
9622 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009623
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009624 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 if (unicode_latin1[i]) {
9626 Py_DECREF(unicode_latin1[i]);
9627 unicode_latin1[i] = NULL;
9628 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009629 }
Christian Heimesa156e092008-02-16 07:38:31 +00009630 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009632
Walter Dörwald16807132007-05-25 13:52:07 +00009633void
9634PyUnicode_InternInPlace(PyObject **p)
9635{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009636 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9637 PyObject *t;
9638 if (s == NULL || !PyUnicode_Check(s))
9639 Py_FatalError(
9640 "PyUnicode_InternInPlace: unicode strings only please!");
9641 /* If it's a subclass, we don't really know what putting
9642 it in the interned dict might do. */
9643 if (!PyUnicode_CheckExact(s))
9644 return;
9645 if (PyUnicode_CHECK_INTERNED(s))
9646 return;
9647 if (interned == NULL) {
9648 interned = PyDict_New();
9649 if (interned == NULL) {
9650 PyErr_Clear(); /* Don't leave an exception */
9651 return;
9652 }
9653 }
9654 /* It might be that the GetItem call fails even
9655 though the key is present in the dictionary,
9656 namely when this happens during a stack overflow. */
9657 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009659 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009660
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 if (t) {
9662 Py_INCREF(t);
9663 Py_DECREF(*p);
9664 *p = t;
9665 return;
9666 }
Walter Dörwald16807132007-05-25 13:52:07 +00009667
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 PyThreadState_GET()->recursion_critical = 1;
9669 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9670 PyErr_Clear();
9671 PyThreadState_GET()->recursion_critical = 0;
9672 return;
9673 }
9674 PyThreadState_GET()->recursion_critical = 0;
9675 /* The two references in interned are not counted by refcnt.
9676 The deallocator will take care of this */
9677 Py_REFCNT(s) -= 2;
9678 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009679}
9680
9681void
9682PyUnicode_InternImmortal(PyObject **p)
9683{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684 PyUnicode_InternInPlace(p);
9685 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9686 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9687 Py_INCREF(*p);
9688 }
Walter Dörwald16807132007-05-25 13:52:07 +00009689}
9690
9691PyObject *
9692PyUnicode_InternFromString(const char *cp)
9693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 PyObject *s = PyUnicode_FromString(cp);
9695 if (s == NULL)
9696 return NULL;
9697 PyUnicode_InternInPlace(&s);
9698 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009699}
9700
9701void _Py_ReleaseInternedUnicodeStrings(void)
9702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 PyObject *keys;
9704 PyUnicodeObject *s;
9705 Py_ssize_t i, n;
9706 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009707
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 if (interned == NULL || !PyDict_Check(interned))
9709 return;
9710 keys = PyDict_Keys(interned);
9711 if (keys == NULL || !PyList_Check(keys)) {
9712 PyErr_Clear();
9713 return;
9714 }
Walter Dörwald16807132007-05-25 13:52:07 +00009715
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9717 detector, interned unicode strings are not forcibly deallocated;
9718 rather, we give them their stolen references back, and then clear
9719 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009720
Benjamin Peterson14339b62009-01-31 16:36:08 +00009721 n = PyList_GET_SIZE(keys);
9722 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009724 for (i = 0; i < n; i++) {
9725 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9726 switch (s->state) {
9727 case SSTATE_NOT_INTERNED:
9728 /* XXX Shouldn't happen */
9729 break;
9730 case SSTATE_INTERNED_IMMORTAL:
9731 Py_REFCNT(s) += 1;
9732 immortal_size += s->length;
9733 break;
9734 case SSTATE_INTERNED_MORTAL:
9735 Py_REFCNT(s) += 2;
9736 mortal_size += s->length;
9737 break;
9738 default:
9739 Py_FatalError("Inconsistent interned string state.");
9740 }
9741 s->state = SSTATE_NOT_INTERNED;
9742 }
9743 fprintf(stderr, "total size of all interned strings: "
9744 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9745 "mortal/immortal\n", mortal_size, immortal_size);
9746 Py_DECREF(keys);
9747 PyDict_Clear(interned);
9748 Py_DECREF(interned);
9749 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009750}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009751
9752
9753/********************* Unicode Iterator **************************/
9754
9755typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009756 PyObject_HEAD
9757 Py_ssize_t it_index;
9758 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009759} unicodeiterobject;
9760
9761static void
9762unicodeiter_dealloc(unicodeiterobject *it)
9763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 _PyObject_GC_UNTRACK(it);
9765 Py_XDECREF(it->it_seq);
9766 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009767}
9768
9769static int
9770unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9771{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009772 Py_VISIT(it->it_seq);
9773 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009774}
9775
9776static PyObject *
9777unicodeiter_next(unicodeiterobject *it)
9778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009779 PyUnicodeObject *seq;
9780 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009781
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782 assert(it != NULL);
9783 seq = it->it_seq;
9784 if (seq == NULL)
9785 return NULL;
9786 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009787
Benjamin Peterson14339b62009-01-31 16:36:08 +00009788 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9789 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009790 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 if (item != NULL)
9792 ++it->it_index;
9793 return item;
9794 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009795
Benjamin Peterson14339b62009-01-31 16:36:08 +00009796 Py_DECREF(seq);
9797 it->it_seq = NULL;
9798 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009799}
9800
9801static PyObject *
9802unicodeiter_len(unicodeiterobject *it)
9803{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009804 Py_ssize_t len = 0;
9805 if (it->it_seq)
9806 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9807 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009808}
9809
9810PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9811
9812static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009815 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009816};
9817
9818PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009819 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9820 "str_iterator", /* tp_name */
9821 sizeof(unicodeiterobject), /* tp_basicsize */
9822 0, /* tp_itemsize */
9823 /* methods */
9824 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9825 0, /* tp_print */
9826 0, /* tp_getattr */
9827 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009828 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 0, /* tp_repr */
9830 0, /* tp_as_number */
9831 0, /* tp_as_sequence */
9832 0, /* tp_as_mapping */
9833 0, /* tp_hash */
9834 0, /* tp_call */
9835 0, /* tp_str */
9836 PyObject_GenericGetAttr, /* tp_getattro */
9837 0, /* tp_setattro */
9838 0, /* tp_as_buffer */
9839 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9840 0, /* tp_doc */
9841 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9842 0, /* tp_clear */
9843 0, /* tp_richcompare */
9844 0, /* tp_weaklistoffset */
9845 PyObject_SelfIter, /* tp_iter */
9846 (iternextfunc)unicodeiter_next, /* tp_iternext */
9847 unicodeiter_methods, /* tp_methods */
9848 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009849};
9850
9851static PyObject *
9852unicode_iter(PyObject *seq)
9853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009855
Benjamin Peterson14339b62009-01-31 16:36:08 +00009856 if (!PyUnicode_Check(seq)) {
9857 PyErr_BadInternalCall();
9858 return NULL;
9859 }
9860 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9861 if (it == NULL)
9862 return NULL;
9863 it->it_index = 0;
9864 Py_INCREF(seq);
9865 it->it_seq = (PyUnicodeObject *)seq;
9866 _PyObject_GC_TRACK(it);
9867 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009868}
9869
Martin v. Löwis5b222132007-06-10 09:51:05 +00009870size_t
9871Py_UNICODE_strlen(const Py_UNICODE *u)
9872{
9873 int res = 0;
9874 while(*u++)
9875 res++;
9876 return res;
9877}
9878
9879Py_UNICODE*
9880Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9881{
9882 Py_UNICODE *u = s1;
9883 while ((*u++ = *s2++));
9884 return s1;
9885}
9886
9887Py_UNICODE*
9888Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9889{
9890 Py_UNICODE *u = s1;
9891 while ((*u++ = *s2++))
9892 if (n-- == 0)
9893 break;
9894 return s1;
9895}
9896
9897int
9898Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9899{
9900 while (*s1 && *s2 && *s1 == *s2)
9901 s1++, s2++;
9902 if (*s1 && *s2)
9903 return (*s1 < *s2) ? -1 : +1;
9904 if (*s1)
9905 return 1;
9906 if (*s2)
9907 return -1;
9908 return 0;
9909}
9910
9911Py_UNICODE*
9912Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9913{
9914 const Py_UNICODE *p;
9915 for (p = s; *p; p++)
9916 if (*p == c)
9917 return (Py_UNICODE*)p;
9918 return NULL;
9919}
9920
9921
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009922#ifdef __cplusplus
9923}
9924#endif
9925
9926
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009927/*
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 Local variables:
9929 c-basic-offset: 4
9930 indent-tabs-mode: nil
9931 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009932*/