blob: e74165a42cc7eb0ae7583baa0770528f5c09a1ae [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Christian Heimes4d4f2702008-01-30 11:32:37 +0000115/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 when the
   code point is one of the frequent whitespace characters, 0 otherwise.
   Each row below covers sixteen consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
146/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 when the
   code point is treated as a line break, 0 otherwise.  Each row below
   covers sixteen consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask of type BLOOM_MASK and
   take the low bits of each character, (ch) & (LONG_BIT - 1), as the
   bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* The shifted constant has to be 1UL, not 1: the bit index can be as large
   as LONG_BIT - 1, and shifting a plain int by its width or more is
   undefined behavior (it silently dropped the upper bits when long is
   wider than int). */

/* BLOOM_ADD(mask, ch): set the bit for character ch in the lvalue mask. */
#define BLOOM_ADD(mask, ch) \
    (((mask) |= (1UL << ((ch) & (LONG_BIT - 1)))))

/* BLOOM(mask, ch): non-zero when the bit for character ch is set in mask.
   A zero result means ch was never added; a non-zero result only means
   that some character with the same bit index was added. */
#define BLOOM(mask, ch) \
    (((mask) & (1UL << ((ch) & (LONG_BIT - 1)))))

/* BLOOM_LINEBREAK(ch): for ch < 128 look the answer up in ascii_linebreak;
   otherwise consult the bloom mask first and only call
   Py_UNICODE_ISLINEBREAK() when the mask does not rule ch out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000203
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000204Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205{
206 /* calculate simple bloom-style bitmask for a given unicode string */
207
208 long mask;
209 Py_ssize_t i;
210
211 mask = 0;
212 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000213 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000214
215 return mask;
216}
217
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000218Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000219{
220 Py_ssize_t i;
221
222 for (i = 0; i < setlen; i++)
223 if (set[i] == chr)
224 return 1;
225
Fredrik Lundh77633512006-05-23 19:47:35 +0000226 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000227}
228
/* BLOOM_MEMBER(mask, chr, set, setlen): true when chr is in set.  The bloom
   mask is tested first so that unicode_member() is only called when the
   mask cannot rule chr out.  Note that chr is evaluated twice.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232/* --- Unicode Object ----------------------------------------------------- */
233
234static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000235int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000236 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237{
238 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000239
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000242 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000244 /* Resizing shared object (unicode_empty or single character
245 objects) in-place is not allowed. Use PyUnicode_Resize()
246 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000247
Benjamin Peterson857ce152009-01-31 16:29:18 +0000248 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000249 (unicode->length == 1 &&
250 unicode->str[0] < 256U &&
251 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 return -1;
255 }
256
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000257 /* We allocate one more byte to make sure the string is Ux0000 terminated.
258 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000259 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000260 it contains). */
261
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000263 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000264 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000266 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_NoMemory();
268 return -1;
269 }
270 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000271 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000273 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000275 if (unicode->defenc) {
276 Py_DECREF(unicode->defenc);
277 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000280
Guido van Rossumd57fd912000-03-10 22:53:23 +0000281 return 0;
282}
283
/* We allocate one more Py_UNICODE element (not just one byte) to make
   sure the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
291
292static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000293PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
295 register PyUnicodeObject *unicode;
296
Andrew Dalkee0df7622006-05-27 11:04:36 +0000297 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 if (length == 0 && unicode_empty != NULL) {
299 Py_INCREF(unicode_empty);
300 return unicode_empty;
301 }
302
Neal Norwitze7d8be82008-07-31 17:17:14 +0000303 /* Ensure we won't overflow the size. */
304 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
305 return (PyUnicodeObject *)PyErr_NoMemory();
306 }
307
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000309 if (free_list) {
310 unicode = free_list;
311 free_list = *(PyUnicodeObject **)unicode;
312 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000313 if (unicode->str) {
314 /* Keep-Alive optimization: we only upsize the buffer,
315 never downsize it. */
316 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000317 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000318 PyObject_DEL(unicode->str);
319 unicode->str = NULL;
320 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000321 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000322 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000323 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
324 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000325 }
326 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 }
328 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000329 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000330 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 if (unicode == NULL)
332 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000333 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
334 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000337 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000338 PyErr_NoMemory();
339 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000340 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000341 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000342 * the caller fails before initializing str -- unicode_resize()
343 * reads str[0], and the Keep-Alive optimization can keep memory
344 * allocated for str alive across a call to unicode_dealloc(unicode).
345 * We don't want unicode_resize to read uninitialized memory in
346 * that case.
347 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000348 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000350 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000352 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000355 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000356 /* XXX UNREF/NEWREF interface should be more symmetrical */
357 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000358 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000359 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361}
362
363static
Guido van Rossum9475a232001-10-05 20:51:39 +0000364void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000366 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000368 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000369 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
370 PyObject_DEL(unicode->str);
371 unicode->str = NULL;
372 unicode->length = 0;
373 }
374 if (unicode->defenc) {
375 Py_DECREF(unicode->defenc);
376 unicode->defenc = NULL;
377 }
378 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000379 *(PyUnicodeObject **)unicode = free_list;
380 free_list = unicode;
381 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382 }
383 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000384 PyObject_DEL(unicode->str);
385 Py_XDECREF(unicode->defenc);
386 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
388}
389
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000390static
391int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392{
393 register PyUnicodeObject *v;
394
395 /* Argument checks */
396 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000397 PyErr_BadInternalCall();
398 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000399 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000401 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000402 PyErr_BadInternalCall();
403 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404 }
405
406 /* Resizing unicode_empty and single character objects is not
407 possible since these are being shared. We simply return a fresh
408 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000409 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000410 (v == unicode_empty || v->length == 1)) {
411 PyUnicodeObject *w = _PyUnicode_New(length);
412 if (w == NULL)
413 return -1;
414 Py_UNICODE_COPY(w->str, v->str,
415 length < v->length ? length : v->length);
416 Py_DECREF(*unicode);
417 *unicode = w;
418 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000419 }
420
421 /* Note that we don't have to modify *unicode for unshared Unicode
422 objects, since we can modify them in-place. */
423 return unicode_resize(v, length);
424}
425
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000426int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
427{
428 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
429}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000430
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000432 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433{
434 PyUnicodeObject *unicode;
435
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436 /* If the Unicode data is known at construction time, we can apply
437 some optimizations which share commonly used objects. */
438 if (u != NULL) {
439
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000440 /* Optimization for empty strings */
441 if (size == 0 && unicode_empty != NULL) {
442 Py_INCREF(unicode_empty);
443 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000444 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000445
446 /* Single character Unicode objects in the Latin-1 range are
447 shared when using this constructor */
448 if (size == 1 && *u < 256) {
449 unicode = unicode_latin1[*u];
450 if (!unicode) {
451 unicode = _PyUnicode_New(1);
452 if (!unicode)
453 return NULL;
454 unicode->str[0] = *u;
455 unicode_latin1[*u] = unicode;
456 }
457 Py_INCREF(unicode);
458 return (PyObject *)unicode;
459 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
Tim Petersced69f82003-09-16 20:30:58 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 unicode = _PyUnicode_New(size);
463 if (!unicode)
464 return NULL;
465
466 /* Copy the Unicode data into the new object */
467 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000468 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469
470 return (PyObject *)unicode;
471}
472
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000473PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
474{
475 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000476
Benjamin Peterson857ce152009-01-31 16:29:18 +0000477 if (size < 0) {
478 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000479 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000480 return NULL;
481 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483 /* If the Unicode data is known at construction time, we can apply
484 some optimizations which share commonly used objects.
485 Also, this means the input must be UTF-8, so fall back to the
486 UTF-8 decoder at the end. */
487 if (u != NULL) {
488
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 /* Optimization for empty strings */
490 if (size == 0 && unicode_empty != NULL) {
491 Py_INCREF(unicode_empty);
492 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000493 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000494
495 /* Single characters are shared when using this constructor.
496 Restrict to ASCII, since the input must be UTF-8. */
497 if (size == 1 && Py_CHARMASK(*u) < 128) {
498 unicode = unicode_latin1[Py_CHARMASK(*u)];
499 if (!unicode) {
500 unicode = _PyUnicode_New(1);
501 if (!unicode)
502 return NULL;
503 unicode->str[0] = Py_CHARMASK(*u);
504 unicode_latin1[Py_CHARMASK(*u)] = unicode;
505 }
506 Py_INCREF(unicode);
507 return (PyObject *)unicode;
508 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000509
510 return PyUnicode_DecodeUTF8(u, size, NULL);
511 }
512
513 unicode = _PyUnicode_New(size);
514 if (!unicode)
515 return NULL;
516
517 return (PyObject *)unicode;
518}
519
520PyObject *PyUnicode_FromString(const char *u)
521{
522 size_t size = strlen(u);
523 if (size > PY_SSIZE_T_MAX) {
524 PyErr_SetString(PyExc_OverflowError, "input too long");
525 return NULL;
526 }
527
528 return PyUnicode_FromStringAndSize(u, size);
529}
530
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531#ifdef HAVE_WCHAR_H
532
Mark Dickinson6b265f12009-03-18 16:07:26 +0000533#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
534# define CONVERT_WCHAR_TO_SURROGATES
535#endif
536
537#ifdef CONVERT_WCHAR_TO_SURROGATES
538
539/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
540 to convert from UTF32 to UTF16. */
541
542PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
543 Py_ssize_t size)
544{
545 PyUnicodeObject *unicode;
546 register Py_ssize_t i;
547 Py_ssize_t alloc;
548 const wchar_t *orig_w;
549
550 if (w == NULL) {
551 PyErr_BadInternalCall();
552 return NULL;
553 }
554
555 alloc = size;
556 orig_w = w;
557 for (i = size; i > 0; i--) {
558 if (*w > 0xFFFF)
559 alloc++;
560 w++;
561 }
562 w = orig_w;
563 unicode = _PyUnicode_New(alloc);
564 if (!unicode)
565 return NULL;
566
567 /* Copy the wchar_t data into the new object */
568 {
569 register Py_UNICODE *u;
570 u = PyUnicode_AS_UNICODE(unicode);
571 for (i = size; i > 0; i--) {
572 if (*w > 0xFFFF) {
573 wchar_t ordinal = *w++;
574 ordinal -= 0x10000;
575 *u++ = 0xD800 | (ordinal >> 10);
576 *u++ = 0xDC00 | (ordinal & 0x3FF);
577 }
578 else
579 *u++ = *w++;
580 }
581 }
582 return (PyObject *)unicode;
583}
584
585#else
586
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000588 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589{
590 PyUnicodeObject *unicode;
591
592 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000593 PyErr_BadInternalCall();
594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595 }
596
597 unicode = _PyUnicode_New(size);
598 if (!unicode)
599 return NULL;
600
601 /* Copy the wchar_t data into the new object */
602#ifdef HAVE_USABLE_WCHAR_T
603 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000604#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000606 register Py_UNICODE *u;
607 register Py_ssize_t i;
608 u = PyUnicode_AS_UNICODE(unicode);
609 for (i = size; i > 0; i--)
610 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611 }
612#endif
613
614 return (PyObject *)unicode;
615}
616
Mark Dickinson6b265f12009-03-18 16:07:26 +0000617#endif /* CONVERT_WCHAR_TO_SURROGATES */
618
619#undef CONVERT_WCHAR_TO_SURROGATES
620
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000621static void
622makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
623{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000624 *fmt++ = '%';
625 if (width) {
626 if (zeropad)
627 *fmt++ = '0';
628 fmt += sprintf(fmt, "%d", width);
629 }
630 if (precision)
631 fmt += sprintf(fmt, ".%d", precision);
632 if (longflag)
633 *fmt++ = 'l';
634 else if (size_tflag) {
635 char *f = PY_FORMAT_SIZE_T;
636 while (*f)
637 *fmt++ = *f++;
638 }
639 *fmt++ = c;
640 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000641}
642
/* appendstring(string): copy the NUL-terminated char string to the output
   position s, advancing s past the copied characters (the terminator is
   not copied).  Relies on two variables of the calling function: s, the
   output cursor, and copy, a const char * scratch pointer.
   Wrapped in do { } while (0) so that "appendstring(x);" is exactly one
   statement and is safe in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
644
645PyObject *
646PyUnicode_FromFormatV(const char *format, va_list vargs)
647{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000648 va_list count;
649 Py_ssize_t callcount = 0;
650 PyObject **callresults = NULL;
651 PyObject **callresult = NULL;
652 Py_ssize_t n = 0;
653 int width = 0;
654 int precision = 0;
655 int zeropad;
656 const char* f;
657 Py_UNICODE *s;
658 PyObject *string;
659 /* used by sprintf */
660 char buffer[21];
661 /* use abuffer instead of buffer, if we need more space
662 * (which can happen if there's a format specifier with width). */
663 char *abuffer = NULL;
664 char *realbuffer;
665 Py_ssize_t abuffersize = 0;
666 char fmt[60]; /* should be enough for %0width.precisionld */
667 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000668
669#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000670 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671#else
672#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000673 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000674#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000675 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000676#endif
677#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000678 /* step 1: count the number of %S/%R/%s format specifications
679 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
680 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000681 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000682 if (*f == '%') {
683 if (*(f+1)=='%')
684 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000685 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000686 ++callcount;
687 while (isdigit((unsigned)*f))
688 width = (width*10) + *f++ - '0';
689 while (*++f && *f != '%' && !isalpha((unsigned)*f))
690 ;
691 if (*f == 's')
692 ++callcount;
693 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000694 }
695 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000697 if (callcount) {
698 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
699 if (!callresults) {
700 PyErr_NoMemory();
701 return NULL;
702 }
703 callresult = callresults;
704 }
705 /* step 3: figure out how large a buffer we need */
706 for (f = format; *f; f++) {
707 if (*f == '%') {
708 const char* p = f;
709 width = 0;
710 while (isdigit((unsigned)*f))
711 width = (width*10) + *f++ - '0';
712 while (*++f && *f != '%' && !isalpha((unsigned)*f))
713 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000714
Benjamin Peterson857ce152009-01-31 16:29:18 +0000715 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
716 * they don't affect the amount of space we reserve.
717 */
718 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000719 (f[1] == 'd' || f[1] == 'u'))
720 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000721
Benjamin Peterson857ce152009-01-31 16:29:18 +0000722 switch (*f) {
723 case 'c':
724 (void)va_arg(count, int);
725 /* fall through... */
726 case '%':
727 n++;
728 break;
729 case 'd': case 'u': case 'i': case 'x':
730 (void) va_arg(count, int);
731 /* 20 bytes is enough to hold a 64-bit
732 integer. Decimal takes the most space.
733 This isn't enough for octal.
734 If a width is specified we need more
735 (which we allocate later). */
736 if (width < 20)
737 width = 20;
738 n += width;
739 if (abuffersize < width)
740 abuffersize = width;
741 break;
742 case 's':
743 {
744 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000745 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000746 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
747 if (!str)
748 goto fail;
749 n += PyUnicode_GET_SIZE(str);
750 /* Remember the str and switch to the next slot */
751 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000752 break;
753 }
754 case 'U':
755 {
756 PyObject *obj = va_arg(count, PyObject *);
757 assert(obj && PyUnicode_Check(obj));
758 n += PyUnicode_GET_SIZE(obj);
759 break;
760 }
761 case 'V':
762 {
763 PyObject *obj = va_arg(count, PyObject *);
764 const char *str = va_arg(count, const char *);
765 assert(obj || str);
766 assert(!obj || PyUnicode_Check(obj));
767 if (obj)
768 n += PyUnicode_GET_SIZE(obj);
769 else
770 n += strlen(str);
771 break;
772 }
773 case 'S':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 PyObject *str;
777 assert(obj);
778 str = PyObject_Str(obj);
779 if (!str)
780 goto fail;
781 n += PyUnicode_GET_SIZE(str);
782 /* Remember the str and switch to the next slot */
783 *callresult++ = str;
784 break;
785 }
786 case 'R':
787 {
788 PyObject *obj = va_arg(count, PyObject *);
789 PyObject *repr;
790 assert(obj);
791 repr = PyObject_Repr(obj);
792 if (!repr)
793 goto fail;
794 n += PyUnicode_GET_SIZE(repr);
795 /* Remember the repr and switch to the next slot */
796 *callresult++ = repr;
797 break;
798 }
799 case 'p':
800 (void) va_arg(count, int);
801 /* maximum 64-bit pointer representation:
802 * 0xffffffffffffffff
803 * so 19 characters is enough.
804 * XXX I count 18 -- what's the extra for?
805 */
806 n += 19;
807 break;
808 default:
809 /* if we stumble upon an unknown
810 formatting code, copy the rest of
811 the format string to the output
812 string. (we cannot just skip the
813 code, since there's no way to know
814 what's in the argument list) */
815 n += strlen(p);
816 goto expand;
817 }
818 } else
819 n++;
820 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000821 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000822 if (abuffersize > 20) {
823 abuffer = PyObject_Malloc(abuffersize);
824 if (!abuffer) {
825 PyErr_NoMemory();
826 goto fail;
827 }
828 realbuffer = abuffer;
829 }
830 else
831 realbuffer = buffer;
832 /* step 4: fill the buffer */
833 /* Since we've analyzed how much space we need for the worst case,
834 we don't have to resize the string.
835 There can be no errors beyond this point. */
836 string = PyUnicode_FromUnicode(NULL, n);
837 if (!string)
838 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000839
Benjamin Peterson857ce152009-01-31 16:29:18 +0000840 s = PyUnicode_AS_UNICODE(string);
841 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000842
Benjamin Peterson857ce152009-01-31 16:29:18 +0000843 for (f = format; *f; f++) {
844 if (*f == '%') {
845 const char* p = f++;
846 int longflag = 0;
847 int size_tflag = 0;
848 zeropad = (*f == '0');
849 /* parse the width.precision part */
850 width = 0;
851 while (isdigit((unsigned)*f))
852 width = (width*10) + *f++ - '0';
853 precision = 0;
854 if (*f == '.') {
855 f++;
856 while (isdigit((unsigned)*f))
857 precision = (precision*10) + *f++ - '0';
858 }
859 /* handle the long flag, but only for %ld and %lu.
860 others can be added when necessary. */
861 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
862 longflag = 1;
863 ++f;
864 }
865 /* handle the size_t flag. */
866 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
867 size_tflag = 1;
868 ++f;
869 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000870
Benjamin Peterson857ce152009-01-31 16:29:18 +0000871 switch (*f) {
872 case 'c':
873 *s++ = va_arg(vargs, int);
874 break;
875 case 'd':
876 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
877 if (longflag)
878 sprintf(realbuffer, fmt, va_arg(vargs, long));
879 else if (size_tflag)
880 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
881 else
882 sprintf(realbuffer, fmt, va_arg(vargs, int));
883 appendstring(realbuffer);
884 break;
885 case 'u':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
893 appendstring(realbuffer);
894 break;
895 case 'i':
896 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
897 sprintf(realbuffer, fmt, va_arg(vargs, int));
898 appendstring(realbuffer);
899 break;
900 case 'x':
901 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
904 break;
905 case 's':
906 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000907 /* unused, since we already have the result */
908 (void) va_arg(vargs, char *);
909 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
910 PyUnicode_GET_SIZE(*callresult));
911 s += PyUnicode_GET_SIZE(*callresult);
912 /* We're done with the unicode()/repr() => forget it */
913 Py_DECREF(*callresult);
914 /* switch to next unicode()/repr() result */
915 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000916 break;
917 }
918 case 'U':
919 {
920 PyObject *obj = va_arg(vargs, PyObject *);
921 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
922 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
923 s += size;
924 break;
925 }
926 case 'V':
927 {
928 PyObject *obj = va_arg(vargs, PyObject *);
929 const char *str = va_arg(vargs, const char *);
930 if (obj) {
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 } else {
935 appendstring(str);
936 }
937 break;
938 }
939 case 'S':
940 case 'R':
941 {
942 Py_UNICODE *ucopy;
943 Py_ssize_t usize;
944 Py_ssize_t upos;
945 /* unused, since we already have the result */
946 (void) va_arg(vargs, PyObject *);
947 ucopy = PyUnicode_AS_UNICODE(*callresult);
948 usize = PyUnicode_GET_SIZE(*callresult);
949 for (upos = 0; upos<usize;)
950 *s++ = ucopy[upos++];
951 /* We're done with the unicode()/repr() => forget it */
952 Py_DECREF(*callresult);
953 /* switch to next unicode()/repr() result */
954 ++callresult;
955 break;
956 }
957 case 'p':
958 sprintf(buffer, "%p", va_arg(vargs, void*));
959 /* %p is ill-defined: ensure leading 0x. */
960 if (buffer[1] == 'X')
961 buffer[1] = 'x';
962 else if (buffer[1] != 'x') {
963 memmove(buffer+2, buffer, strlen(buffer)+1);
964 buffer[0] = '0';
965 buffer[1] = 'x';
966 }
967 appendstring(buffer);
968 break;
969 case '%':
970 *s++ = '%';
971 break;
972 default:
973 appendstring(p);
974 goto end;
975 }
976 } else
977 *s++ = *f;
978 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000979
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000980 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000981 if (callresults)
982 PyObject_Free(callresults);
983 if (abuffer)
984 PyObject_Free(abuffer);
985 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
986 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000987 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000988 if (callresults) {
989 PyObject **callresult2 = callresults;
990 while (callresult2 < callresult) {
991 Py_DECREF(*callresult2);
992 ++callresult2;
993 }
994 PyObject_Free(callresults);
995 }
996 if (abuffer)
997 PyObject_Free(abuffer);
998 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000999}
1000
1001#undef appendstring
1002
1003PyObject *
1004PyUnicode_FromFormat(const char *format, ...)
1005{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001006 PyObject* ret;
1007 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001008
1009#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001010 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001012 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001013#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001014 ret = PyUnicode_FromFormatV(format, vargs);
1015 va_end(vargs);
1016 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001017}
1018
Martin v. Löwis18e16552006-02-15 17:27:45 +00001019Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001020 wchar_t *w,
1021 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022{
1023 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001024 PyErr_BadInternalCall();
1025 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001027
1028 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001031
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032#ifdef HAVE_USABLE_WCHAR_T
1033 memcpy(w, unicode->str, size * sizeof(wchar_t));
1034#else
1035 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 register Py_UNICODE *u;
1037 register Py_ssize_t i;
1038 u = PyUnicode_AS_UNICODE(unicode);
1039 for (i = size; i > 0; i--)
1040 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 }
1042#endif
1043
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001044 if (size > PyUnicode_GET_SIZE(unicode))
1045 return PyUnicode_GET_SIZE(unicode);
1046 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001047 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048}
1049
1050#endif
1051
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001052PyObject *PyUnicode_FromOrdinal(int ordinal)
1053{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001054 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001055
1056#ifdef Py_UNICODE_WIDE
1057 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001058 PyErr_SetString(PyExc_ValueError,
1059 "unichr() arg not in range(0x110000) "
1060 "(wide Python build)");
1061 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062 }
1063#else
1064 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001065 PyErr_SetString(PyExc_ValueError,
1066 "unichr() arg not in range(0x10000) "
1067 "(narrow Python build)");
1068 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001069 }
1070#endif
1071
Hye-Shik Chang40574832004-04-06 07:24:51 +00001072 s[0] = (Py_UNICODE)ordinal;
1073 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074}
1075
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076PyObject *PyUnicode_FromObject(register PyObject *obj)
1077{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001078 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001079 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001080 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001081 Py_INCREF(obj);
1082 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001083 }
1084 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001085 /* For a Unicode subtype that's not a Unicode object,
1086 return a true Unicode object with the same data. */
1087 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1088 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001089 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001090 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1091}
1092
1093PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001094 const char *encoding,
1095 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001096{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001097 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001098 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001099 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001102 PyErr_BadInternalCall();
1103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001106#if 0
1107 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001108 that no encodings is given and then redirect to
1109 PyObject_Unicode() which then applies the additional logic for
1110 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001111
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001112 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001113 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001114
1115 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001116 if (PyUnicode_Check(obj)) {
1117 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001118 PyErr_SetString(PyExc_TypeError,
1119 "decoding Unicode is not supported");
1120 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001121 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001122 return PyObject_Unicode(obj);
1123 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124#else
1125 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001126 PyErr_SetString(PyExc_TypeError,
1127 "decoding Unicode is not supported");
1128 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001129 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001130#endif
1131
1132 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001133 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001134 s = PyString_AS_STRING(obj);
1135 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001136 }
Christian Heimes3497f942008-05-26 12:29:14 +00001137 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001138 /* Python 2.x specific */
1139 PyErr_Format(PyExc_TypeError,
1140 "decoding bytearray is not supported");
1141 return NULL;
1142 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001144 /* Overwrite the error message with something more useful in
1145 case of a TypeError. */
1146 if (PyErr_ExceptionMatches(PyExc_TypeError))
1147 PyErr_Format(PyExc_TypeError,
1148 "coercing to Unicode: need string or buffer, "
1149 "%.80s found",
1150 Py_TYPE(obj)->tp_name);
1151 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001152 }
Tim Petersced69f82003-09-16 20:30:58 +00001153
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 Py_INCREF(unicode_empty);
1157 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001160 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001161
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001162 return v;
1163
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001164 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166}
1167
1168PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001169 Py_ssize_t size,
1170 const char *encoding,
1171 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172{
1173 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001174
1175 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001177
1178 /* Shortcuts for common default encodings */
1179 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001181 else if (strcmp(encoding, "latin-1") == 0)
1182 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001183#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1184 else if (strcmp(encoding, "mbcs") == 0)
1185 return PyUnicode_DecodeMBCS(s, size, errors);
1186#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001187 else if (strcmp(encoding, "ascii") == 0)
1188 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189
1190 /* Decode via the codec registry */
1191 buffer = PyBuffer_FromMemory((void *)s, size);
1192 if (buffer == NULL)
1193 goto onError;
1194 unicode = PyCodec_Decode(buffer, encoding, errors);
1195 if (unicode == NULL)
1196 goto onError;
1197 if (!PyUnicode_Check(unicode)) {
1198 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001199 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001200 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201 Py_DECREF(unicode);
1202 goto onError;
1203 }
1204 Py_DECREF(buffer);
1205 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001206
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 Py_XDECREF(buffer);
1209 return NULL;
1210}
1211
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001212PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1213 const char *encoding,
1214 const char *errors)
1215{
1216 PyObject *v;
1217
1218 if (!PyUnicode_Check(unicode)) {
1219 PyErr_BadArgument();
1220 goto onError;
1221 }
1222
1223 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001224 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001225
1226 /* Decode via the codec registry */
1227 v = PyCodec_Decode(unicode, encoding, errors);
1228 if (v == NULL)
1229 goto onError;
1230 return v;
1231
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001232 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001233 return NULL;
1234}
1235
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240{
1241 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001242
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 unicode = PyUnicode_FromUnicode(s, size);
1244 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1247 Py_DECREF(unicode);
1248 return v;
1249}
1250
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001251PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1252 const char *encoding,
1253 const char *errors)
1254{
1255 PyObject *v;
1256
1257 if (!PyUnicode_Check(unicode)) {
1258 PyErr_BadArgument();
1259 goto onError;
1260 }
1261
1262 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001263 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001264
1265 /* Encode via the codec registry */
1266 v = PyCodec_Encode(unicode, encoding, errors);
1267 if (v == NULL)
1268 goto onError;
1269 return v;
1270
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001271 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001272 return NULL;
1273}
1274
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1276 const char *encoding,
1277 const char *errors)
1278{
1279 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001280
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 if (!PyUnicode_Check(unicode)) {
1282 PyErr_BadArgument();
1283 goto onError;
1284 }
Fred Drakee4315f52000-05-09 19:53:39 +00001285
Tim Petersced69f82003-09-16 20:30:58 +00001286 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001287 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001288
1289 /* Shortcuts for common default encodings */
1290 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001291 if (strcmp(encoding, "utf-8") == 0)
1292 return PyUnicode_AsUTF8String(unicode);
1293 else if (strcmp(encoding, "latin-1") == 0)
1294 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001295#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001296 else if (strcmp(encoding, "mbcs") == 0)
1297 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001298#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001299 else if (strcmp(encoding, "ascii") == 0)
1300 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302
1303 /* Encode via the codec registry */
1304 v = PyCodec_Encode(unicode, encoding, errors);
1305 if (v == NULL)
1306 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001307 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001309 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001310 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 Py_DECREF(v);
1312 goto onError;
1313 }
1314 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 return NULL;
1318}
1319
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001320PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001321 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001322{
1323 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1324
1325 if (v)
1326 return v;
1327 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1328 if (v && errors == NULL)
1329 ((PyUnicodeObject *)unicode)->defenc = v;
1330 return v;
1331}
1332
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1334{
1335 if (!PyUnicode_Check(unicode)) {
1336 PyErr_BadArgument();
1337 goto onError;
1338 }
1339 return PyUnicode_AS_UNICODE(unicode);
1340
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001341 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342 return NULL;
1343}
1344
Martin v. Löwis18e16552006-02-15 17:27:45 +00001345Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_GET_SIZE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return -1;
1355}
1356
Thomas Wouters78890102000-07-22 19:25:51 +00001357const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001358{
1359 return unicode_default_encoding;
1360}
1361
1362int PyUnicode_SetDefaultEncoding(const char *encoding)
1363{
1364 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001365
Fred Drakee4315f52000-05-09 19:53:39 +00001366 /* Make sure the encoding is valid. As side effect, this also
1367 loads the encoding into the codec registry cache. */
1368 v = _PyCodec_Lookup(encoding);
1369 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001370 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001371 Py_DECREF(v);
1372 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001373 encoding,
1374 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001375 return 0;
1376
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001377 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001378 return -1;
1379}
1380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381/* error handling callback helper:
1382 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001383 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001384 and adjust various state variables.
1385 return 0 on success, -1 on error
1386*/
1387
1388static
1389int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001390 const char *encoding, const char *reason,
1391 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1392 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1393 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001395 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396
1397 PyObject *restuple = NULL;
1398 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001399 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1400 Py_ssize_t requiredsize;
1401 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404 int res = -1;
1405
1406 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001407 *errorHandler = PyCodec_LookupError(errors);
1408 if (*errorHandler == NULL)
1409 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 }
1411
1412 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001413 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001414 encoding, input, insize, *startinpos, *endinpos, reason);
1415 if (*exceptionObject == NULL)
1416 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 }
1418 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1420 goto onError;
1421 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1422 goto onError;
1423 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 }
1426
1427 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1428 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001430 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001431 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 }
1434 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001435 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001437 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001438 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1440 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001441 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442
1443 /* need more space? (at least enough for what we
1444 have+the replacement+the rest of the string (starting
1445 at the new input position), so we won't have to check space
1446 when there are no errors in the rest of the string) */
1447 repptr = PyUnicode_AS_UNICODE(repunicode);
1448 repsize = PyUnicode_GET_SIZE(repunicode);
1449 requiredsize = *outpos + repsize + insize-newpos;
1450 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 if (requiredsize<2*outsize)
1452 requiredsize = 2*outsize;
1453 if (_PyUnicode_Resize(output, requiredsize) < 0)
1454 goto onError;
1455 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 }
1457 *endinpos = newpos;
1458 *inptr = input + newpos;
1459 Py_UNICODE_COPY(*outptr, repptr, repsize);
1460 *outptr += repsize;
1461 *outpos += repsize;
1462 /* we made it! */
1463 res = 0;
1464
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001465 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 Py_XDECREF(restuple);
1467 return res;
1468}
1469
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470/* --- UTF-7 Codec -------------------------------------------------------- */
1471
Antoine Pitrou653dece2009-05-04 18:32:32 +00001472/* See RFC2152 for details. We encode conservatively and decode liberally. */
1473
1474/* Three simple macros defining base-64. */
1475
1476/* Is c a base-64 character? */
1477
1478#define IS_BASE64(c) \
1479 (isalnum(c) || (c) == '+' || (c) == '/')
1480
1481/* given that c is a base-64 character, what is its base-64 value? */
1482
1483#define FROM_BASE64(c) \
1484 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1485 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1486 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1487 (c) == '+' ? 62 : 63)
1488
1489/* What is the base-64 character of the bottom 6 bits of n? */
1490
1491#define TO_BASE64(n) \
1492 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1493
1494/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1495 * decoded as itself. We are permissive on decoding; the only ASCII
1496 * byte not decoding to itself is the + which begins a base64
1497 * string. */
1498
1499#define DECODE_DIRECT(c) \
1500 ((c) <= 127 && (c) != '+')
1501
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never modified,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1535
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code; NUL and anything >= 128 is never direct
 *   directO  - non-zero if Set O characters may be emitted as themselves
 *   directWS - non-zero if whitespace may be emitted as itself
 *
 * All three arguments are parenthesized in the expansion so that
 * arbitrary expressions can be passed; c is evaluated several times and
 * must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001549 Py_ssize_t size,
1550 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001552 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1553}
1554
Antoine Pitrou653dece2009-05-04 18:32:32 +00001555/* The decoder. The only state we preserve is our read position,
1556 * i.e. how many characters we have consumed. So if we end in the
1557 * middle of a shift sequence we have to back off the read position
1558 * and the output to the beginning of the sequence, otherwise we lose
1559 * all the shift state (seen bits, number of bits seen, high
1560 * surrogate). */
1561
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001563 Py_ssize_t size,
1564 const char *errors,
1565 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001568 Py_ssize_t startinpos;
1569 Py_ssize_t endinpos;
1570 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001571 const char *e;
1572 PyUnicodeObject *unicode;
1573 Py_UNICODE *p;
1574 const char *errmsg = "";
1575 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001576 Py_UNICODE *shiftOutStart;
1577 unsigned int base64bits = 0;
1578 unsigned long base64buffer = 0;
1579 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001580 PyObject *errorHandler = NULL;
1581 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582
1583 unicode = _PyUnicode_New(size);
1584 if (!unicode)
1585 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001586 if (size == 0) {
1587 if (consumed)
1588 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001590 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591
1592 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001593 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 e = s + size;
1595
1596 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001597 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598
Antoine Pitrou653dece2009-05-04 18:32:32 +00001599 if (inShift) { /* in a base-64 section */
1600 if (IS_BASE64(ch)) { /* consume a base-64 character */
1601 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1602 base64bits += 6;
1603 s++;
1604 if (base64bits >= 16) {
1605 /* we have enough bits for a UTF-16 value */
1606 Py_UNICODE outCh = (Py_UNICODE)
1607 (base64buffer >> (base64bits-16));
1608 base64bits -= 16;
1609 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1610 if (surrogate) {
1611 /* expecting a second surrogate */
1612 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1613#ifdef Py_UNICODE_WIDE
1614 *p++ = (((surrogate & 0x3FF)<<10)
1615 | (outCh & 0x3FF)) + 0x10000;
1616#else
1617 *p++ = surrogate;
1618 *p++ = outCh;
1619#endif
1620 surrogate = 0;
1621 }
1622 else {
1623 surrogate = 0;
1624 errmsg = "second surrogate missing";
1625 goto utf7Error;
1626 }
1627 }
1628 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1629 /* first surrogate */
1630 surrogate = outCh;
1631 }
1632 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1633 errmsg = "unexpected second surrogate";
1634 goto utf7Error;
1635 }
1636 else {
1637 *p++ = outCh;
1638 }
1639 }
1640 }
1641 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 inShift = 0;
1643 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001644 if (surrogate) {
1645 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001646 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001648 if (base64bits > 0) { /* left-over bits */
1649 if (base64bits >= 6) {
1650 /* We've seen at least one base-64 character */
1651 errmsg = "partial character in shift sequence";
1652 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 else {
1655 /* Some bits remain; they should be zero */
1656 if (base64buffer != 0) {
1657 errmsg = "non-zero padding bits in shift sequence";
1658 goto utf7Error;
1659 }
1660 }
1661 }
1662 if (ch != '-') {
1663 /* '-' is absorbed; other terminating
1664 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 *p++ = ch;
1666 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001667 }
1668 }
1669 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001670 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001671 s++; /* consume '+' */
1672 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 s++;
1674 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001675 }
1676 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001678 shiftOutStart = p;
1679 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 }
1681 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001682 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 *p++ = ch;
1684 s++;
1685 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001686 else {
1687 startinpos = s-starts;
1688 s++;
1689 errmsg = "unexpected special character";
1690 goto utf7Error;
1691 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001693utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001694 outpos = p-PyUnicode_AS_UNICODE(unicode);
1695 endinpos = s-starts;
1696 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001697 errors, &errorHandler,
1698 "utf7", errmsg,
1699 starts, size, &startinpos, &endinpos, &exc, &s,
1700 &unicode, &outpos, &p))
1701 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 }
1703
Antoine Pitrou653dece2009-05-04 18:32:32 +00001704 /* end of string */
1705
1706 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1707 /* if we're in an inconsistent state, that's an error */
1708 if (surrogate ||
1709 (base64bits >= 6) ||
1710 (base64bits > 0 && base64buffer != 0)) {
1711 outpos = p-PyUnicode_AS_UNICODE(unicode);
1712 endinpos = size;
1713 if (unicode_decode_call_errorhandler(
1714 errors, &errorHandler,
1715 "utf7", "unterminated shift sequence",
1716 starts, size, &startinpos, &endinpos, &exc, &s,
1717 &unicode, &outpos, &p))
1718 goto onError;
1719 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001720 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001721
1722 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001723 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001724 if (inShift) {
1725 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001726 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001727 }
1728 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001729 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001733 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 goto onError;
1735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 Py_XDECREF(errorHandler);
1737 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 return (PyObject *)unicode;
1739
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001741 Py_XDECREF(errorHandler);
1742 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 Py_DECREF(unicode);
1744 return NULL;
1745}
1746
1747
1748PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001749 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001750 int base64SetO,
1751 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753{
1754 PyObject *v;
1755 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001756 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001759 unsigned int base64bits = 0;
1760 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 char * out;
1762 char * start;
1763
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001764 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001765 return PyErr_NoMemory();
1766
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001768 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769
Antoine Pitrou653dece2009-05-04 18:32:32 +00001770 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 if (v == NULL)
1772 return NULL;
1773
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001774 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775 for (;i < size; ++i) {
1776 Py_UNICODE ch = s[i];
1777
Antoine Pitrou653dece2009-05-04 18:32:32 +00001778 if (inShift) {
1779 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1780 /* shifting out */
1781 if (base64bits) { /* output remaining bits */
1782 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1783 base64buffer = 0;
1784 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785 }
1786 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001787 /* Characters not in the BASE64 set implicitly unshift the sequence
1788 so no '-' is required, except if the character is itself a '-' */
1789 if (IS_BASE64(ch) || ch == '-') {
1790 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001791 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001792 *out++ = (char) ch;
1793 }
1794 else {
1795 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001796 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798 else { /* not in a shift sequence */
1799 if (ch == '+') {
1800 *out++ = '+';
1801 *out++ = '-';
1802 }
1803 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1804 *out++ = (char) ch;
1805 }
1806 else {
1807 *out++ = '+';
1808 inShift = 1;
1809 goto encode_char;
1810 }
1811 }
1812 continue;
1813encode_char:
1814#ifdef Py_UNICODE_WIDE
1815 if (ch >= 0x10000) {
1816 /* code first surrogate */
1817 base64bits += 16;
1818 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1819 while (base64bits >= 6) {
1820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1821 base64bits -= 6;
1822 }
1823 /* prepare second surrogate */
1824 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1825 }
1826#endif
1827 base64bits += 16;
1828 base64buffer = (base64buffer << 16) | ch;
1829 while (base64bits >= 6) {
1830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831 base64bits -= 6;
1832 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001833 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001834 if (base64bits)
1835 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1836 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001839 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840 return v;
1841}
1842
/* The helper macros above are private to the UTF-7 codec. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848
/* --- UTF-8 Codec -------------------------------------------------------- */
1850
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.  The table is only
       ever read, hence const. */
    /* 0x00-0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001874 Py_ssize_t size,
1875 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876{
Walter Dörwald69652032004-09-07 20:24:22 +00001877 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1878}
1879
1880PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001881 Py_ssize_t size,
1882 const char *errors,
1883 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t startinpos;
1888 Py_ssize_t endinpos;
1889 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 const char *e;
1891 PyUnicodeObject *unicode;
1892 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 PyObject *errorHandler = NULL;
1895 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896
1897 /* Note: size will always be longer than the resulting Unicode
1898 character count */
1899 unicode = _PyUnicode_New(size);
1900 if (!unicode)
1901 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001902 if (size == 0) {
1903 if (consumed)
1904 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
1908 /* Unpack UTF-8 encoded data */
1909 p = unicode->str;
1910 e = s + size;
1911
1912 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001913 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914
1915 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 s++;
1918 continue;
1919 }
1920
1921 n = utf8_code_length[ch];
1922
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001924 if (consumed)
1925 break;
1926 else {
1927 errmsg = "unexpected end of data";
1928 startinpos = s-starts;
1929 endinpos = size;
1930 goto utf8Error;
1931 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933
1934 switch (n) {
1935
1936 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 startinpos = s-starts;
1939 endinpos = startinpos+1;
1940 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941
1942 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001943 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+1;
1946 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947
1948 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001949 if ((s[1] & 0xc0) != 0x80) {
1950 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001951 startinpos = s-starts;
1952 endinpos = startinpos+2;
1953 goto utf8Error;
1954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 goto utf8Error;
1961 }
1962 else
1963 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 break;
1965
1966 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001967 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 (s[2] & 0xc0) != 0x80) {
1969 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001970 startinpos = s-starts;
1971 endinpos = startinpos+3;
1972 goto utf8Error;
1973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001975 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001976 /* Note: UTF-8 encodings of surrogates are considered
1977 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001978
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001979 XXX For wide builds (UCS-4) we should probably try
1980 to recombine the surrogates into a single code
1981 unit.
1982 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001983 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001984 startinpos = s-starts;
1985 endinpos = startinpos+3;
1986 goto utf8Error;
1987 }
1988 else
1989 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001990 break;
1991
1992 case 4:
1993 if ((s[1] & 0xc0) != 0x80 ||
1994 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 (s[3] & 0xc0) != 0x80) {
1996 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+4;
1999 goto utf8Error;
2000 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002001 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002002 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002005 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002007 UTF-16 */
2008 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002009 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+4;
2012 goto utf8Error;
2013 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002014#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002016#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002018
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002019 /* translate from 10000..10FFFF to 0..FFFF */
2020 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002021
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002022 /* high surrogate = top 10 bits added to D800 */
2023 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002024
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002025 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002026 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 break;
2029
2030 default:
2031 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002032 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002033 startinpos = s-starts;
2034 endinpos = startinpos+n;
2035 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002040 utf8Error:
2041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
2045 starts, size, &startinpos, &endinpos, &exc, &s,
2046 &unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002074 Py_ssize_t size,
2075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Martin v. Löwis18e16552006-02-15 17:27:45 +00002079 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002080 PyObject *v; /* result string object */
2081 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002082 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2095 v = NULL; /* will allocate after we're done */
2096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002103 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002104 if (v == NULL)
2105 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002106 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Tim Peters602f7402002-04-27 18:03:26 +00002150 if (v == NULL) {
2151 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002154 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002157 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002158 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002163
Tim Peters602f7402002-04-27 18:03:26 +00002164#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165}
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 if (!PyUnicode_Check(unicode)) {
2170 PyErr_BadArgument();
2171 return NULL;
2172 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002173 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002174 PyUnicode_GET_SIZE(unicode),
2175 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176}
2177
/* --- UTF-32 Codec ------------------------------------------------------- */
2179
2180PyObject *
2181PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002182 Py_ssize_t size,
2183 const char *errors,
2184 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002185{
2186 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2187}
2188
2189PyObject *
2190PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002191 Py_ssize_t size,
2192 const char *errors,
2193 int *byteorder,
2194 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002195{
2196 const char *starts = s;
2197 Py_ssize_t startinpos;
2198 Py_ssize_t endinpos;
2199 Py_ssize_t outpos;
2200 PyUnicodeObject *unicode;
2201 Py_UNICODE *p;
2202#ifndef Py_UNICODE_WIDE
2203 int i, pairs;
2204#else
2205 const int pairs = 0;
2206#endif
2207 const unsigned char *q, *e;
2208 int bo = 0; /* assume native ordering by default */
2209 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002210 /* Offsets from q for retrieving bytes in the right order. */
2211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2212 int iorder[] = {0, 1, 2, 3};
2213#else
2214 int iorder[] = {3, 2, 1, 0};
2215#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002216 PyObject *errorHandler = NULL;
2217 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002218 /* On narrow builds we split characters outside the BMP into two
2219 codepoints => count how much extra space we need. */
2220#ifndef Py_UNICODE_WIDE
2221 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002222 if (((Py_UCS4 *)s)[i] >= 0x10000)
2223 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002224#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002225
2226 /* This might be one to much, because of a BOM */
2227 unicode = _PyUnicode_New((size+3)/4+pairs);
2228 if (!unicode)
2229 return NULL;
2230 if (size == 0)
2231 return (PyObject *)unicode;
2232
2233 /* Unpack UTF-32 encoded data */
2234 p = unicode->str;
2235 q = (unsigned char *)s;
2236 e = q + size;
2237
2238 if (byteorder)
2239 bo = *byteorder;
2240
2241 /* Check for BOM marks (U+FEFF) in the input and adjust current
2242 byte order setting accordingly. In native mode, the leading BOM
2243 mark is skipped, in all other modes, it is copied to the output
2244 stream as-is (giving a ZWNBSP character). */
2245 if (bo == 0) {
2246 if (size >= 4) {
2247 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002248 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002249#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002250 if (bom == 0x0000FEFF) {
2251 q += 4;
2252 bo = -1;
2253 }
2254 else if (bom == 0xFFFE0000) {
2255 q += 4;
2256 bo = 1;
2257 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002258#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002259 if (bom == 0x0000FEFF) {
2260 q += 4;
2261 bo = 1;
2262 }
2263 else if (bom == 0xFFFE0000) {
2264 q += 4;
2265 bo = -1;
2266 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002267#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002268 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002269 }
2270
2271 if (bo == -1) {
2272 /* force LE */
2273 iorder[0] = 0;
2274 iorder[1] = 1;
2275 iorder[2] = 2;
2276 iorder[3] = 3;
2277 }
2278 else if (bo == 1) {
2279 /* force BE */
2280 iorder[0] = 3;
2281 iorder[1] = 2;
2282 iorder[2] = 1;
2283 iorder[3] = 0;
2284 }
2285
2286 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002287 Py_UCS4 ch;
2288 /* remaining bytes at the end? (size should be divisible by 4) */
2289 if (e-q<4) {
2290 if (consumed)
2291 break;
2292 errmsg = "truncated data";
2293 startinpos = ((const char *)q)-starts;
2294 endinpos = ((const char *)e)-starts;
2295 goto utf32Error;
2296 /* The remaining input chars are ignored if the callback
2297 chooses to skip the input */
2298 }
2299 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2300 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002301
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002302 if (ch >= 0x110000)
2303 {
2304 errmsg = "codepoint not in range(0x110000)";
2305 startinpos = ((const char *)q)-starts;
2306 endinpos = startinpos+4;
2307 goto utf32Error;
2308 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002309#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002310 if (ch >= 0x10000)
2311 {
2312 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2313 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 }
2315 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002316#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002317 *p++ = ch;
2318 q += 4;
2319 continue;
2320 utf32Error:
2321 outpos = p-PyUnicode_AS_UNICODE(unicode);
2322 if (unicode_decode_call_errorhandler(
2323 errors, &errorHandler,
2324 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002325 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002326 &unicode, &outpos, &p))
2327 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002328 }
2329
2330 if (byteorder)
2331 *byteorder = bo;
2332
2333 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002334 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002335
2336 /* Adjust length */
2337 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2338 goto onError;
2339
2340 Py_XDECREF(errorHandler);
2341 Py_XDECREF(exc);
2342 return (PyObject *)unicode;
2343
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002345 Py_DECREF(unicode);
2346 Py_XDECREF(errorHandler);
2347 Py_XDECREF(exc);
2348 return NULL;
2349}
2350
2351PyObject *
2352PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002353 Py_ssize_t size,
2354 const char *errors,
2355 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002356{
2357 PyObject *v;
2358 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002359 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002360#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002361 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002362#else
2363 const int pairs = 0;
2364#endif
2365 /* Offsets from p for storing byte pairs in the right order. */
2366#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2367 int iorder[] = {0, 1, 2, 3};
2368#else
2369 int iorder[] = {3, 2, 1, 0};
2370#endif
2371
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002372#define STORECHAR(CH) \
2373 do { \
2374 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2375 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2376 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2377 p[iorder[0]] = (CH) & 0xff; \
2378 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002379 } while(0)
2380
2381 /* In narrow builds we can output surrogate pairs as one codepoint,
2382 so we need less space. */
2383#ifndef Py_UNICODE_WIDE
2384 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002385 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2386 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2387 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002388#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002389 nsize = (size - pairs + (byteorder == 0));
2390 bytesize = nsize * 4;
2391 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002392 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002393 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002394 if (v == NULL)
2395 return NULL;
2396
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002397 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002398 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002400 if (size == 0)
2401 return v;
2402
2403 if (byteorder == -1) {
2404 /* force LE */
2405 iorder[0] = 0;
2406 iorder[1] = 1;
2407 iorder[2] = 2;
2408 iorder[3] = 3;
2409 }
2410 else if (byteorder == 1) {
2411 /* force BE */
2412 iorder[0] = 3;
2413 iorder[1] = 2;
2414 iorder[2] = 1;
2415 iorder[3] = 0;
2416 }
2417
2418 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002419 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002420#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002421 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2422 Py_UCS4 ch2 = *s;
2423 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2424 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2425 s++;
2426 size--;
2427 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002428 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002429#endif
2430 STORECHAR(ch);
2431 }
2432 return v;
2433#undef STORECHAR
2434}
2435
2436PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437{
2438 if (!PyUnicode_Check(unicode)) {
2439 PyErr_BadArgument();
2440 return NULL;
2441 }
2442 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002443 PyUnicode_GET_SIZE(unicode),
2444 NULL,
2445 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002446}
2447
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448/* --- UTF-16 Codec ------------------------------------------------------- */
2449
Tim Peters772747b2001-08-09 22:21:55 +00002450PyObject *
2451PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002452 Py_ssize_t size,
2453 const char *errors,
2454 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455{
Walter Dörwald69652032004-09-07 20:24:22 +00002456 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2457}
2458
2459PyObject *
2460PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002461 Py_ssize_t size,
2462 const char *errors,
2463 int *byteorder,
2464 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002467 Py_ssize_t startinpos;
2468 Py_ssize_t endinpos;
2469 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 PyUnicodeObject *unicode;
2471 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002472 const unsigned char *q, *e;
2473 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002475 /* Offsets from q for retrieving byte pairs in the right order. */
2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2477 int ihi = 1, ilo = 0;
2478#else
2479 int ihi = 0, ilo = 1;
2480#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 PyObject *errorHandler = NULL;
2482 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 /* Note: size will always be longer than the resulting Unicode
2485 character count */
2486 unicode = _PyUnicode_New(size);
2487 if (!unicode)
2488 return NULL;
2489 if (size == 0)
2490 return (PyObject *)unicode;
2491
2492 /* Unpack UTF-16 encoded data */
2493 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002494 q = (unsigned char *)s;
2495 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496
2497 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002498 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002500 /* Check for BOM marks (U+FEFF) in the input and adjust current
2501 byte order setting accordingly. In native mode, the leading BOM
2502 mark is skipped, in all other modes, it is copied to the output
2503 stream as-is (giving a ZWNBSP character). */
2504 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002505 if (size >= 2) {
2506 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002508 if (bom == 0xFEFF) {
2509 q += 2;
2510 bo = -1;
2511 }
2512 else if (bom == 0xFFFE) {
2513 q += 2;
2514 bo = 1;
2515 }
Tim Petersced69f82003-09-16 20:30:58 +00002516#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002517 if (bom == 0xFEFF) {
2518 q += 2;
2519 bo = 1;
2520 }
2521 else if (bom == 0xFFFE) {
2522 q += 2;
2523 bo = -1;
2524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002525#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002526 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Tim Peters772747b2001-08-09 22:21:55 +00002529 if (bo == -1) {
2530 /* force LE */
2531 ihi = 1;
2532 ilo = 0;
2533 }
2534 else if (bo == 1) {
2535 /* force BE */
2536 ihi = 0;
2537 ilo = 1;
2538 }
2539
2540 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002541 Py_UNICODE ch;
2542 /* remaining bytes at the end? (size should be even) */
2543 if (e-q<2) {
2544 if (consumed)
2545 break;
2546 errmsg = "truncated data";
2547 startinpos = ((const char *)q)-starts;
2548 endinpos = ((const char *)e)-starts;
2549 goto utf16Error;
2550 /* The remaining input chars are ignored if the callback
2551 chooses to skip the input */
2552 }
2553 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554
Benjamin Peterson857ce152009-01-31 16:29:18 +00002555 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002556
2557 if (ch < 0xD800 || ch > 0xDFFF) {
2558 *p++ = ch;
2559 continue;
2560 }
2561
2562 /* UTF-16 code pair: */
2563 if (q >= e) {
2564 errmsg = "unexpected end of data";
2565 startinpos = (((const char *)q)-2)-starts;
2566 endinpos = ((const char *)e)-starts;
2567 goto utf16Error;
2568 }
2569 if (0xD800 <= ch && ch <= 0xDBFF) {
2570 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2571 q += 2;
2572 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002573#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002574 *p++ = ch;
2575 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002579 continue;
2580 }
2581 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002582 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002583 startinpos = (((const char *)q)-4)-starts;
2584 endinpos = startinpos+2;
2585 goto utf16Error;
2586 }
2587
Benjamin Peterson857ce152009-01-31 16:29:18 +00002588 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 errmsg = "illegal encoding";
2590 startinpos = (((const char *)q)-2)-starts;
2591 endinpos = startinpos+2;
2592 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002593
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002594 utf16Error:
2595 outpos = p-PyUnicode_AS_UNICODE(unicode);
2596 if (unicode_decode_call_errorhandler(
2597 errors, &errorHandler,
2598 "utf16", errmsg,
2599 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2600 &unicode, &outpos, &p))
2601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603
2604 if (byteorder)
2605 *byteorder = bo;
2606
Walter Dörwald69652032004-09-07 20:24:22 +00002607 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002608 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002609
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002611 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 goto onError;
2613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 Py_XDECREF(errorHandler);
2615 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 return (PyObject *)unicode;
2617
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002618 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 Py_XDECREF(errorHandler);
2621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 return NULL;
2623}
2624
Tim Peters772747b2001-08-09 22:21:55 +00002625PyObject *
2626PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002627 Py_ssize_t size,
2628 const char *errors,
2629 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630{
2631 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002632 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002633 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002634#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002635 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002636#else
2637 const int pairs = 0;
2638#endif
Tim Peters772747b2001-08-09 22:21:55 +00002639 /* Offsets from p for storing byte pairs in the right order. */
2640#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2641 int ihi = 1, ilo = 0;
2642#else
2643 int ihi = 0, ilo = 1;
2644#endif
2645
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002646#define STORECHAR(CH) \
2647 do { \
2648 p[ihi] = ((CH) >> 8) & 0xff; \
2649 p[ilo] = (CH) & 0xff; \
2650 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002651 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002653#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002654 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002655 if (s[i] >= 0x10000)
2656 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002657#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002658 /* 2 * (size + pairs + (byteorder == 0)) */
2659 if (size > PY_SSIZE_T_MAX ||
2660 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002661 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002662 nsize = size + pairs + (byteorder == 0);
2663 bytesize = nsize * 2;
2664 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002666 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667 if (v == NULL)
2668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002670 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002673 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002674 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002675
2676 if (byteorder == -1) {
2677 /* force LE */
2678 ihi = 1;
2679 ilo = 0;
2680 }
2681 else if (byteorder == 1) {
2682 /* force BE */
2683 ihi = 0;
2684 ilo = 1;
2685 }
2686
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002687 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002688 Py_UNICODE ch = *s++;
2689 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002690#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002691 if (ch >= 0x10000) {
2692 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2693 ch = 0xD800 | ((ch-0x10000) >> 10);
2694 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002695#endif
Tim Peters772747b2001-08-09 22:21:55 +00002696 STORECHAR(ch);
2697 if (ch2)
2698 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002701#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702}
2703
2704PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2705{
2706 if (!PyUnicode_Check(unicode)) {
2707 PyErr_BadArgument();
2708 return NULL;
2709 }
2710 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002711 PyUnicode_GET_SIZE(unicode),
2712 NULL,
2713 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714}
2715
2716/* --- Unicode Escape Codec ----------------------------------------------- */
2717
Fredrik Lundh06d12682001-01-24 07:59:11 +00002718static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002719
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002721 Py_ssize_t size,
2722 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002725 Py_ssize_t startinpos;
2726 Py_ssize_t endinpos;
2727 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002732 char* message;
2733 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 PyObject *errorHandler = NULL;
2735 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 /* Escaped strings will always be longer than the resulting
2738 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 length after conversion to the true value.
2740 (but if the error callback returns a long replacement string
2741 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 v = _PyUnicode_New(size);
2743 if (v == NULL)
2744 goto onError;
2745 if (size == 0)
2746 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002750
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 while (s < end) {
2752 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002753 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755
2756 /* Non-escape characters are interpreted as Unicode ordinals */
2757 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 continue;
2760 }
2761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 /* \ - Escapes */
2764 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002765 c = *s++;
2766 if (s > end)
2767 c = '\0'; /* Invalid after \ */
2768 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002770 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 case '\n': break;
2772 case '\\': *p++ = '\\'; break;
2773 case '\'': *p++ = '\''; break;
2774 case '\"': *p++ = '\"'; break;
2775 case 'b': *p++ = '\b'; break;
2776 case 'f': *p++ = '\014'; break; /* FF */
2777 case 't': *p++ = '\t'; break;
2778 case 'n': *p++ = '\n'; break;
2779 case 'r': *p++ = '\r'; break;
2780 case 'v': *p++ = '\013'; break; /* VT */
2781 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2782
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002783 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 case '0': case '1': case '2': case '3':
2785 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002786 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002787 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002788 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002789 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002790 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002792 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 break;
2794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002795 /* hex escapes */
2796 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 digits = 2;
2799 message = "truncated \\xXX escape";
2800 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002802 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002804 digits = 4;
2805 message = "truncated \\uXXXX escape";
2806 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002808 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002809 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810 digits = 8;
2811 message = "truncated \\UXXXXXXXX escape";
2812 hexescape:
2813 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 outpos = p-PyUnicode_AS_UNICODE(v);
2815 if (s+digits>end) {
2816 endinpos = size;
2817 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002818 errors, &errorHandler,
2819 "unicodeescape", "end of string in escape sequence",
2820 starts, size, &startinpos, &endinpos, &exc, &s,
2821 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 goto onError;
2823 goto nextByte;
2824 }
2825 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002826 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002827 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 endinpos = (s+i+1)-starts;
2829 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002830 errors, &errorHandler,
2831 "unicodeescape", message,
2832 starts, size, &startinpos, &endinpos, &exc, &s,
2833 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002836 }
2837 chr = (chr<<4) & ~0xF;
2838 if (c >= '0' && c <= '9')
2839 chr += c - '0';
2840 else if (c >= 'a' && c <= 'f')
2841 chr += 10 + c - 'a';
2842 else
2843 chr += 10 + c - 'A';
2844 }
2845 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002846 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 /* _decoding_error will have already written into the
2848 target buffer. */
2849 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002850 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002851 /* when we get here, chr is a 32-bit unicode character */
2852 if (chr <= 0xffff)
2853 /* UCS-2 character */
2854 *p++ = (Py_UNICODE) chr;
2855 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002856 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002857 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002858#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002859 *p++ = chr;
2860#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002861 chr -= 0x10000L;
2862 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002863 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002864#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002865 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 endinpos = s-starts;
2867 outpos = p-PyUnicode_AS_UNICODE(v);
2868 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002869 errors, &errorHandler,
2870 "unicodeescape", "illegal Unicode character",
2871 starts, size, &startinpos, &endinpos, &exc, &s,
2872 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002873 goto onError;
2874 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875 break;
2876
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002877 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 case 'N':
2879 message = "malformed \\N character escape";
2880 if (ucnhash_CAPI == NULL) {
2881 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002882 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002883 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 if (m == NULL)
2885 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002886 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002888 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002890 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002891 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 if (ucnhash_CAPI == NULL)
2893 goto ucnhashError;
2894 }
2895 if (*s == '{') {
2896 const char *start = s+1;
2897 /* look for the closing brace */
2898 while (*s != '}' && s < end)
2899 s++;
2900 if (s > start && s < end && *s == '}') {
2901 /* found a name. look it up in the unicode database */
2902 message = "unknown Unicode character name";
2903 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002904 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002905 goto store;
2906 }
2907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908 endinpos = s-starts;
2909 outpos = p-PyUnicode_AS_UNICODE(v);
2910 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002911 errors, &errorHandler,
2912 "unicodeescape", message,
2913 starts, size, &startinpos, &endinpos, &exc, &s,
2914 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 break;
2917
2918 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002919 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 message = "\\ at end of string";
2921 s--;
2922 endinpos = s-starts;
2923 outpos = p-PyUnicode_AS_UNICODE(v);
2924 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002925 errors, &errorHandler,
2926 "unicodeescape", message,
2927 starts, size, &startinpos, &endinpos, &exc, &s,
2928 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002929 goto onError;
2930 }
2931 else {
2932 *p++ = '\\';
2933 *p++ = (unsigned char)s[-1];
2934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002937 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002938 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002940 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002942 Py_XDECREF(errorHandler);
2943 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002945
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002946 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002947 PyErr_SetString(
2948 PyExc_UnicodeError,
2949 "\\N escapes not supported (can't load unicodedata module)"
2950 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 Py_XDECREF(errorHandler);
2953 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002954 return NULL;
2955
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002956 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 Py_XDECREF(errorHandler);
2959 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 return NULL;
2961}
2962
2963/* Return a Unicode-Escape string version of the Unicode object.
2964
2965 If quotes is true, the string is enclosed in u"" or u'' quotes as
2966 appropriate.
2967
2968*/
2969
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002970Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002971 Py_ssize_t size,
2972 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002973{
2974 /* like wcschr, but doesn't stop at NULL characters */
2975
2976 while (size-- > 0) {
2977 if (*s == ch)
2978 return s;
2979 s++;
2980 }
2981
2982 return NULL;
2983}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002984
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985static
2986PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002987 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 int quotes)
2989{
2990 PyObject *repr;
2991 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002993 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002994#ifdef Py_UNICODE_WIDE
2995 const Py_ssize_t expandsize = 10;
2996#else
2997 const Py_ssize_t expandsize = 6;
2998#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999
Neal Norwitz17753ec2006-08-21 22:21:19 +00003000 /* XXX(nnorwitz): rather than over-allocating, it would be
3001 better to choose a different scheme. Perhaps scan the
3002 first N-chars of the string and allocate based on that size.
3003 */
3004 /* Initial allocation is based on the longest-possible unichr
3005 escape.
3006
3007 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3008 unichr, so in this case it's the longest unichr escape. In
3009 narrow (UTF-16) builds this is five chars per source unichr
3010 since there are two unichrs in the surrogate pair, so in narrow
3011 (UTF-16) builds it's not the longest unichr escape.
3012
3013 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3014 so in the narrow (UTF-16) build case it's the longest unichr
3015 escape.
3016 */
3017
Neal Norwitze7d8be82008-07-31 17:17:14 +00003018 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003019 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003020
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003021 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003022 2
3023 + expandsize*size
3024 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 if (repr == NULL)
3026 return NULL;
3027
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003028 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
3030 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003032 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 !findchar(s, size, '"')) ? '"' : '\'';
3034 }
3035 while (size-- > 0) {
3036 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003037
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003038 /* Escape quotes and backslashes */
3039 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003040 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 *p++ = '\\';
3042 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003047 /* Map 21-bit characters to '\U00xxxxxx' */
3048 else if (ch >= 0x10000) {
3049 *p++ = '\\';
3050 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003051 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003058 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003060 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003061#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3063 else if (ch >= 0xD800 && ch < 0xDC00) {
3064 Py_UNICODE ch2;
3065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003066
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 ch2 = *s++;
3068 size--;
3069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3071 *p++ = '\\';
3072 *p++ = 'U';
3073 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3080 *p++ = hexdigit[ucs & 0x0000000F];
3081 continue;
3082 }
3083 /* Fall through: isolated surrogates are copied as-is */
3084 s--;
3085 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003086 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003088
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 *p++ = '\\';
3092 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093 *p++ = hexdigit[(ch >> 12) & 0x000F];
3094 *p++ = hexdigit[(ch >> 8) & 0x000F];
3095 *p++ = hexdigit[(ch >> 4) & 0x000F];
3096 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003099 /* Map special whitespace to '\t', \n', '\r' */
3100 else if (ch == '\t') {
3101 *p++ = '\\';
3102 *p++ = 't';
3103 }
3104 else if (ch == '\n') {
3105 *p++ = '\\';
3106 *p++ = 'n';
3107 }
3108 else if (ch == '\r') {
3109 *p++ = '\\';
3110 *p++ = 'r';
3111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003116 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003117 *p++ = hexdigit[(ch >> 4) & 0x000F];
3118 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 /* Copy everything else as-is */
3122 else
3123 *p++ = (char) ch;
3124 }
3125 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003126 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127
3128 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003129 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 return repr;
3131}
3132
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003134 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135{
3136 return unicodeescape_string(s, size, 0);
3137}
3138
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140{
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 return NULL;
3144 }
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003146 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147}
3148
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 Py_ssize_t size,
3153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t startinpos;
3157 Py_ssize_t endinpos;
3158 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 const char *end;
3162 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 PyObject *errorHandler = NULL;
3164 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Escaped strings will always be longer than the resulting
3167 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 length after conversion to the true value. (But decoding error
3169 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 v = _PyUnicode_New(size);
3171 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 end = s + size;
3177 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 unsigned char c;
3179 Py_UCS4 x;
3180 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003183 /* Non-escape characters are interpreted as Unicode ordinals */
3184 if (*s != '\\') {
3185 *p++ = (unsigned char)*s++;
3186 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003187 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003188 startinpos = s-starts;
3189
3190 /* \u-escapes are only interpreted iff the number of leading
3191 backslashes if odd */
3192 bs = s;
3193 for (;s < end;) {
3194 if (*s != '\\')
3195 break;
3196 *p++ = (unsigned char)*s++;
3197 }
3198 if (((s - bs) & 1) == 0 ||
3199 s >= end ||
3200 (*s != 'u' && *s != 'U')) {
3201 continue;
3202 }
3203 p--;
3204 count = *s=='u' ? 4 : 8;
3205 s++;
3206
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 for (x = 0, i = 0; i < count; ++i, ++s) {
3210 c = (unsigned char)*s;
3211 if (!isxdigit(c)) {
3212 endinpos = s-starts;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "rawunicodeescape", "truncated \\uXXXX",
3216 starts, size, &startinpos, &endinpos, &exc, &s,
3217 &v, &outpos, &p))
3218 goto onError;
3219 goto nextByte;
3220 }
3221 x = (x<<4) & ~0xF;
3222 if (c >= '0' && c <= '9')
3223 x += c - '0';
3224 else if (c >= 'a' && c <= 'f')
3225 x += 10 + c - 'a';
3226 else
3227 x += 10 + c - 'A';
3228 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003229 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003230 /* UCS-2 character */
3231 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003232 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* UCS-4 character. Either store directly, or as
3234 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 x -= 0x10000L;
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#endif
3242 } else {
3243 endinpos = s-starts;
3244 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003245 if (unicode_decode_call_errorhandler(
3246 errors, &errorHandler,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 starts, size, &startinpos, &endinpos, &exc, &s,
3249 &v, &outpos, &p))
3250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 nextByte:
3253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003260
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return NULL;
3266}
3267
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270{
3271 PyObject *repr;
3272 char *p;
3273 char *q;
3274
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003275 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003277 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003279 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003281
Neal Norwitze7d8be82008-07-31 17:17:14 +00003282 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003284
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (repr == NULL)
3287 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003288 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003291 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 while (size-- > 0) {
3293 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003294#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigit[(ch >> 28) & 0xf];
3300 *p++ = hexdigit[(ch >> 24) & 0xf];
3301 *p++ = hexdigit[(ch >> 20) & 0xf];
3302 *p++ = hexdigit[(ch >> 16) & 0xf];
3303 *p++ = hexdigit[(ch >> 12) & 0xf];
3304 *p++ = hexdigit[(ch >> 8) & 0xf];
3305 *p++ = hexdigit[(ch >> 4) & 0xf];
3306 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003307 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003308 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003309#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch >= 0xD800 && ch < 0xDC00) {
3312 Py_UNICODE ch2;
3313 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003315 ch2 = *s++;
3316 size--;
3317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319 *p++ = '\\';
3320 *p++ = 'U';
3321 *p++ = hexdigit[(ucs >> 28) & 0xf];
3322 *p++ = hexdigit[(ucs >> 24) & 0xf];
3323 *p++ = hexdigit[(ucs >> 20) & 0xf];
3324 *p++ = hexdigit[(ucs >> 16) & 0xf];
3325 *p++ = hexdigit[(ucs >> 12) & 0xf];
3326 *p++ = hexdigit[(ucs >> 8) & 0xf];
3327 *p++ = hexdigit[(ucs >> 4) & 0xf];
3328 *p++ = hexdigit[ucs & 0xf];
3329 continue;
3330 }
3331 /* Fall through: isolated surrogates are copied as-is */
3332 s--;
3333 size++;
3334 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003335#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 /* Map 16-bit characters to '\uxxxx' */
3337 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 *p++ = '\\';
3339 *p++ = 'u';
3340 *p++ = hexdigit[(ch >> 12) & 0xf];
3341 *p++ = hexdigit[(ch >> 8) & 0xf];
3342 *p++ = hexdigit[(ch >> 4) & 0xf];
3343 *p++ = hexdigit[ch & 15];
3344 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 /* Copy everything else as-is */
3346 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 *p++ = (char) ch;
3348 }
3349 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003350 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 return repr;
3352}
3353
3354PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355{
3356 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003357 PyErr_BadArgument();
3358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 }
3360 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003361 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362}
3363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364/* --- Unicode Internal Codec ------------------------------------------- */
3365
3366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003367 Py_ssize_t size,
3368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003369{
3370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 Py_ssize_t startinpos;
3372 Py_ssize_t endinpos;
3373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003374 PyUnicodeObject *v;
3375 Py_UNICODE *p;
3376 const char *end;
3377 const char *reason;
3378 PyObject *errorHandler = NULL;
3379 PyObject *exc = NULL;
3380
Neal Norwitzd43069c2006-01-08 01:12:10 +00003381#ifdef Py_UNICODE_WIDE
3382 Py_UNICODE unimax = PyUnicode_GetMax();
3383#endif
3384
Armin Rigo7ccbca92006-10-04 12:17:45 +00003385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3387 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003391 p = PyUnicode_AS_UNICODE(v);
3392 end = s + size;
3393
3394 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 /* We have to sanity check the raw data, otherwise doom looms for
3397 some malformed UCS-4 data. */
3398 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003400 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003402 end-s < Py_UNICODE_SIZE
3403 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003405 startinpos = s - starts;
3406 if (end-s < Py_UNICODE_SIZE) {
3407 endinpos = end-starts;
3408 reason = "truncated input";
3409 }
3410 else {
3411 endinpos = s - starts + Py_UNICODE_SIZE;
3412 reason = "illegal code point (> 0x10FFFF)";
3413 }
3414 outpos = p - PyUnicode_AS_UNICODE(v);
3415 if (unicode_decode_call_errorhandler(
3416 errors, &errorHandler,
3417 "unicode_internal", reason,
3418 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 goto onError;
3421 }
3422 }
3423 else {
3424 p++;
3425 s += Py_UNICODE_SIZE;
3426 }
3427 }
3428
Martin v. Löwis412fb672006-04-13 06:34:32 +00003429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003430 goto onError;
3431 Py_XDECREF(errorHandler);
3432 Py_XDECREF(exc);
3433 return (PyObject *)v;
3434
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 Py_XDECREF(v);
3437 Py_XDECREF(errorHandler);
3438 Py_XDECREF(exc);
3439 return NULL;
3440}
3441
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442/* --- Latin-1 Codec ------------------------------------------------------ */
3443
3444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 Py_ssize_t size,
3446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
3448 PyUnicodeObject *v;
3449 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003450
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003452 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003453 Py_UNICODE r = *(unsigned char*)s;
3454 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003455 }
3456
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 v = _PyUnicode_New(size);
3458 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003459 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 p = PyUnicode_AS_UNICODE(v);
3463 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 Py_XDECREF(v);
3469 return NULL;
3470}
3471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472/* create or adjust a UnicodeEncodeError */
3473static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 const char *encoding,
3475 const Py_UNICODE *unicode, Py_ssize_t size,
3476 Py_ssize_t startpos, Py_ssize_t endpos,
3477 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003480 *exceptionObject = PyUnicodeEncodeError_Create(
3481 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 }
3483 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003484 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3485 goto onError;
3486 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3487 goto onError;
3488 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3489 goto onError;
3490 return;
3491 onError:
3492 Py_DECREF(*exceptionObject);
3493 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 }
3495}
3496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497/* raises a UnicodeEncodeError */
3498static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 const char *encoding,
3500 const Py_UNICODE *unicode, Py_ssize_t size,
3501 Py_ssize_t startpos, Py_ssize_t endpos,
3502 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503{
3504 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508}
3509
3510/* error handling callback helper:
3511 build arguments, call the callback and check the arguments,
3512 put the result into newpos and return the replacement string, which
3513 has to be freed by the caller */
3514static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 PyObject **errorHandler,
3516 const char *encoding, const char *reason,
3517 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3518 Py_ssize_t startpos, Py_ssize_t endpos,
3519 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522
3523 PyObject *restuple;
3524 PyObject *resunicode;
3525
3526 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 }
3531
3532 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536
3537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003542 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 Py_DECREF(restuple);
3544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 }
3546 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 &resunicode, newpos)) {
3548 Py_DECREF(restuple);
3549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 }
3551 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003553 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 Py_INCREF(resunicode);
3559 Py_DECREF(restuple);
3560 return resunicode;
3561}
3562
3563static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 Py_ssize_t size,
3565 const char *errors,
3566 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567{
3568 /* output object */
3569 PyObject *res;
3570 /* pointers to the beginning and end+1 of input */
3571 const Py_UNICODE *startp = p;
3572 const Py_UNICODE *endp = p + size;
3573 /* pointer to the beginning of the unencodable characters */
3574 /* const Py_UNICODE *badp = NULL; */
3575 /* pointer into the output */
3576 char *str;
3577 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003578 Py_ssize_t respos = 0;
3579 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003580 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3581 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 PyObject *errorHandler = NULL;
3583 PyObject *exc = NULL;
3584 /* the following variable is used for caching string comparisons
3585 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3586 int known_errorHandler = -1;
3587
3588 /* allocate enough for a simple encoding without
3589 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003590 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (res == NULL)
3592 goto onError;
3593 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003594 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003595 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 ressize = size;
3597
3598 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 /* can we encode this? */
3602 if (c<limit) {
3603 /* no overflow check, because we know that the space is enough */
3604 *str++ = (char)c;
3605 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003606 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003607 else {
3608 Py_ssize_t unicodepos = p-startp;
3609 Py_ssize_t requiredsize;
3610 PyObject *repunicode;
3611 Py_ssize_t repsize;
3612 Py_ssize_t newpos;
3613 Py_ssize_t respos;
3614 Py_UNICODE *uni2;
3615 /* startpos for collecting unencodable chars */
3616 const Py_UNICODE *collstart = p;
3617 const Py_UNICODE *collend = p;
3618 /* find all unecodable characters */
3619 while ((collend < endp) && ((*collend)>=limit))
3620 ++collend;
3621 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 while (collstart++<collend)
3640 *str++ = '?'; /* fall through */
3641 case 3: /* ignore */
3642 p = collend;
3643 break;
3644 case 4: /* xmlcharrefreplace */
3645 respos = str-PyString_AS_STRING(res);
3646 /* determine replacement size (temporarily (mis)uses p) */
3647 for (p = collstart, repsize = 0; p < collend; ++p) {
3648 if (*p<10)
3649 repsize += 2+1+1;
3650 else if (*p<100)
3651 repsize += 2+2+1;
3652 else if (*p<1000)
3653 repsize += 2+3+1;
3654 else if (*p<10000)
3655 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003656#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003657 else
3658 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003659#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else if (*p<100000)
3661 repsize += 2+5+1;
3662 else if (*p<1000000)
3663 repsize += 2+6+1;
3664 else
3665 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003666#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 }
3668 requiredsize = respos+repsize+(endp-collend);
3669 if (requiredsize > ressize) {
3670 if (requiredsize<2*ressize)
3671 requiredsize = 2*ressize;
3672 if (_PyString_Resize(&res, requiredsize))
3673 goto onError;
3674 str = PyString_AS_STRING(res) + respos;
3675 ressize = requiredsize;
3676 }
3677 /* generate replacement (temporarily (mis)uses p) */
3678 for (p = collstart; p < collend; ++p) {
3679 str += sprintf(str, "&#%d;", (int)*p);
3680 }
3681 p = collend;
3682 break;
3683 default:
3684 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3685 encoding, reason, startp, size, &exc,
3686 collstart-startp, collend-startp, &newpos);
3687 if (repunicode == NULL)
3688 goto onError;
3689 /* need more space? (at least enough for what we have+the
3690 replacement+the rest of the string, so we won't have to
3691 check space for encodable characters) */
3692 respos = str-PyString_AS_STRING(res);
3693 repsize = PyUnicode_GET_SIZE(repunicode);
3694 requiredsize = respos+repsize+(endp-collend);
3695 if (requiredsize > ressize) {
3696 if (requiredsize<2*ressize)
3697 requiredsize = 2*ressize;
3698 if (_PyString_Resize(&res, requiredsize)) {
3699 Py_DECREF(repunicode);
3700 goto onError;
3701 }
3702 str = PyString_AS_STRING(res) + respos;
3703 ressize = requiredsize;
3704 }
3705 /* check if there is anything unencodable in the replacement
3706 and copy it to the output */
3707 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3708 c = *uni2;
3709 if (c >= limit) {
3710 raise_encode_exception(&exc, encoding, startp, size,
3711 unicodepos, unicodepos+1, reason);
3712 Py_DECREF(repunicode);
3713 goto onError;
3714 }
3715 *str = (char)c;
3716 }
3717 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003718 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 }
3721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003723 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 /* If this falls res will be NULL */
3726 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 Py_XDECREF(errorHandler);
3728 Py_XDECREF(exc);
3729 return res;
3730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 Py_XDECREF(res);
3733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
3735 return NULL;
3736}
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003739 Py_ssize_t size,
3740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743}
3744
3745PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746{
3747 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003748 PyErr_BadArgument();
3749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 }
3751 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003752 PyUnicode_GET_SIZE(unicode),
3753 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754}
3755
3756/* --- 7-bit ASCII Codec -------------------------------------------------- */
3757
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 Py_ssize_t size,
3760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
3764 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003765 Py_ssize_t startinpos;
3766 Py_ssize_t endinpos;
3767 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 const char *e;
3769 PyObject *errorHandler = NULL;
3770 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003771
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003773 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003774 Py_UNICODE r = *(unsigned char*)s;
3775 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776 }
Tim Petersced69f82003-09-16 20:30:58 +00003777
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 v = _PyUnicode_New(size);
3779 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003782 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 e = s + size;
3785 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 register unsigned char c = (unsigned char)*s;
3787 if (c < 128) {
3788 *p++ = c;
3789 ++s;
3790 }
3791 else {
3792 startinpos = s-starts;
3793 endinpos = startinpos + 1;
3794 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3795 if (unicode_decode_call_errorhandler(
3796 errors, &errorHandler,
3797 "ascii", "ordinal not in range(128)",
3798 starts, size, &startinpos, &endinpos, &exc, &s,
3799 &v, &outpos, &p))
3800 goto onError;
3801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003803 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003804 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3805 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_XDECREF(errorHandler);
3807 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003809
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 Py_XDECREF(errorHandler);
3813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 return NULL;
3815}
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003818 Py_ssize_t size,
3819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822}
3823
3824PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825{
3826 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003827 PyErr_BadArgument();
3828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 }
3830 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003831 PyUnicode_GET_SIZE(unicode),
3832 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833}
3834
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003835#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003836
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003837/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003838
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003839#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003840#define NEED_RETRY
3841#endif
3842
3843/* XXX This code is limited to "true" double-byte encodings, as
3844 a) it assumes an incomplete character consists of a single byte, and
3845 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003847
/* Test whether the byte at s[offset] should be treated as a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the previous
   character position is looked up with CharPrev() and 1 is returned if
     - there is no previous position (CharPrev() returned `curr` itself), or
     - the byte at the previous position is not itself a lead byte, or
     - the previous position is exactly two bytes back;
   in every other case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3858
3859/*
3860 * Decode MBCS string into unicode object. If 'final' is set, converts
3861 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 */
3863static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 const char *s, /* MBCS string */
3865 int size, /* sizeof MBCS string */
3866 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003867{
3868 Py_UNICODE *p;
3869 Py_ssize_t n = 0;
3870 int usize = 0;
3871
3872 assert(size >= 0);
3873
3874 /* Skip trailing lead-byte unless 'final' is set */
3875 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877
3878 /* First get the size of the result */
3879 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003880 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3881 if (usize == 0) {
3882 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3883 return -1;
3884 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003885 }
3886
3887 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003888 /* Create unicode object */
3889 *v = _PyUnicode_New(usize);
3890 if (*v == NULL)
3891 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 /* Extend unicode object */
3895 n = PyUnicode_GET_SIZE(*v);
3896 if (_PyUnicode_Resize(v, n + usize) < 0)
3897 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898 }
3899
3900 /* Do the conversion */
3901 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 p = PyUnicode_AS_UNICODE(*v) + n;
3903 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3904 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3905 return -1;
3906 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 }
3908
3909 return size;
3910}
3911
3912PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003913 Py_ssize_t size,
3914 const char *errors,
3915 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916{
3917 PyUnicodeObject *v = NULL;
3918 int done;
3919
3920 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922
3923#ifdef NEED_RETRY
3924 retry:
3925 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003926 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927 else
3928#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930
3931 if (done < 0) {
3932 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 }
3935
3936 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003937 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003938
3939#ifdef NEED_RETRY
3940 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003941 s += done;
3942 size -= done;
3943 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003944 }
3945#endif
3946
3947 return (PyObject *)v;
3948}
3949
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003951 Py_ssize_t size,
3952 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3955}
3956
3957/*
3958 * Convert unicode into string object (MBCS).
3959 * Returns 0 if succeed, -1 otherwise.
3960 */
3961static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003962 const Py_UNICODE *p, /* unicode */
3963 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964{
3965 int mbcssize = 0;
3966 Py_ssize_t n = 0;
3967
3968 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003969
3970 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003972 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3973 if (mbcssize == 0) {
3974 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3975 return -1;
3976 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003977 }
3978
Martin v. Löwisd8251432006-06-14 05:21:04 +00003979 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003980 /* Create string object */
3981 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3982 if (*repr == NULL)
3983 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003984 }
3985 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 /* Extend string object */
3987 n = PyString_Size(*repr);
3988 if (_PyString_Resize(repr, n + mbcssize) < 0)
3989 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990 }
3991
3992 /* Do the conversion */
3993 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003994 char *s = PyString_AS_STRING(*repr) + n;
3995 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3996 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3997 return -1;
3998 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 }
4000
4001 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004002}
4003
4004PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004005 Py_ssize_t size,
4006 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004007{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004008 PyObject *repr = NULL;
4009 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004010
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 else
4016#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004018
Martin v. Löwisd8251432006-06-14 05:21:04 +00004019 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 Py_XDECREF(repr);
4021 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004022 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004023
4024#ifdef NEED_RETRY
4025 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004026 p += INT_MAX;
4027 size -= INT_MAX;
4028 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029 }
4030#endif
4031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032 return repr;
4033}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004034
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004035PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036{
4037 if (!PyUnicode_Check(unicode)) {
4038 PyErr_BadArgument();
4039 return NULL;
4040 }
4041 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004042 PyUnicode_GET_SIZE(unicode),
4043 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004044}
4045
Martin v. Löwisd8251432006-06-14 05:21:04 +00004046#undef NEED_RETRY
4047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004048#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050/* --- Character Mapping Codec -------------------------------------------- */
4051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004053 Py_ssize_t size,
4054 PyObject *mapping,
4055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 Py_ssize_t startinpos;
4059 Py_ssize_t endinpos;
4060 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 PyUnicodeObject *v;
4063 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004064 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 PyObject *errorHandler = NULL;
4066 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 /* Default to Latin-1 */
4071 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073
4074 v = _PyUnicode_New(size);
4075 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004076 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004081 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 mapstring = PyUnicode_AS_UNICODE(mapping);
4083 maplen = PyUnicode_GET_SIZE(mapping);
4084 while (s < e) {
4085 unsigned char ch = *s;
4086 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004088 if (ch < maplen)
4089 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 if (x == 0xfffe) {
4092 /* undefined mapping */
4093 outpos = p-PyUnicode_AS_UNICODE(v);
4094 startinpos = s-starts;
4095 endinpos = startinpos+1;
4096 if (unicode_decode_call_errorhandler(
4097 errors, &errorHandler,
4098 "charmap", "character maps to <undefined>",
4099 starts, size, &startinpos, &endinpos, &exc, &s,
4100 &v, &outpos, &p)) {
4101 goto onError;
4102 }
4103 continue;
4104 }
4105 *p++ = x;
4106 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004107 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004108 }
4109 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004110 while (s < e) {
4111 unsigned char ch = *s;
4112 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004114 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4115 w = PyInt_FromLong((long)ch);
4116 if (w == NULL)
4117 goto onError;
4118 x = PyObject_GetItem(mapping, w);
4119 Py_DECREF(w);
4120 if (x == NULL) {
4121 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122 /* No mapping found means: mapping is undefined. */
4123 PyErr_Clear();
4124 x = Py_None;
4125 Py_INCREF(x);
4126 } else
4127 goto onError;
4128 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004129
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 /* Apply mapping */
4131 if (PyInt_Check(x)) {
4132 long value = PyInt_AS_LONG(x);
4133 if (value < 0 || value > 65535) {
4134 PyErr_SetString(PyExc_TypeError,
4135 "character mapping must be in range(65536)");
4136 Py_DECREF(x);
4137 goto onError;
4138 }
4139 *p++ = (Py_UNICODE)value;
4140 }
4141 else if (x == Py_None) {
4142 /* undefined mapping */
4143 outpos = p-PyUnicode_AS_UNICODE(v);
4144 startinpos = s-starts;
4145 endinpos = startinpos+1;
4146 if (unicode_decode_call_errorhandler(
4147 errors, &errorHandler,
4148 "charmap", "character maps to <undefined>",
4149 starts, size, &startinpos, &endinpos, &exc, &s,
4150 &v, &outpos, &p)) {
4151 Py_DECREF(x);
4152 goto onError;
4153 }
4154 Py_DECREF(x);
4155 continue;
4156 }
4157 else if (PyUnicode_Check(x)) {
4158 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004159
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004160 if (targetsize == 1)
4161 /* 1-1 mapping */
4162 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004163
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004164 else if (targetsize > 1) {
4165 /* 1-n mapping */
4166 if (targetsize > extrachars) {
4167 /* resize first */
4168 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4169 Py_ssize_t needed = (targetsize - extrachars) + \
4170 (targetsize << 2);
4171 extrachars += needed;
4172 /* XXX overflow detection missing */
4173 if (_PyUnicode_Resize(&v,
4174 PyUnicode_GET_SIZE(v) + needed) < 0) {
4175 Py_DECREF(x);
4176 goto onError;
4177 }
4178 p = PyUnicode_AS_UNICODE(v) + oldpos;
4179 }
4180 Py_UNICODE_COPY(p,
4181 PyUnicode_AS_UNICODE(x),
4182 targetsize);
4183 p += targetsize;
4184 extrachars -= targetsize;
4185 }
4186 /* 1-0 mapping: skip the character */
4187 }
4188 else {
4189 /* wrong return value */
4190 PyErr_SetString(PyExc_TypeError,
4191 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004192 Py_DECREF(x);
4193 goto onError;
4194 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 Py_DECREF(x);
4196 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 }
4199 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004200 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4201 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_XDECREF(errorHandler);
4203 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004205
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004206 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 Py_XDECREF(errorHandler);
4208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_XDECREF(v);
4210 return NULL;
4211}
4212
Martin v. Löwis3f767792006-06-04 19:36:28 +00004213/* Charmap encoding: the lookup table */
4214
4215struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216 PyObject_HEAD
4217 unsigned char level1[32];
4218 int count2, count3;
4219 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220};
4221
4222static PyObject*
4223encoding_map_size(PyObject *obj, PyObject* args)
4224{
4225 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004226 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227 128*map->count3);
4228}
4229
4230static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004231 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 PyDoc_STR("Return the size (in bytes) of this object") },
4233 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004234};
4235
4236static void
4237encoding_map_dealloc(PyObject* o)
4238{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004239 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004240}
4241
4242static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004243 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004244 "EncodingMap", /*tp_name*/
4245 sizeof(struct encoding_map), /*tp_basicsize*/
4246 0, /*tp_itemsize*/
4247 /* methods */
4248 encoding_map_dealloc, /*tp_dealloc*/
4249 0, /*tp_print*/
4250 0, /*tp_getattr*/
4251 0, /*tp_setattr*/
4252 0, /*tp_compare*/
4253 0, /*tp_repr*/
4254 0, /*tp_as_number*/
4255 0, /*tp_as_sequence*/
4256 0, /*tp_as_mapping*/
4257 0, /*tp_hash*/
4258 0, /*tp_call*/
4259 0, /*tp_str*/
4260 0, /*tp_getattro*/
4261 0, /*tp_setattro*/
4262 0, /*tp_as_buffer*/
4263 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4264 0, /*tp_doc*/
4265 0, /*tp_traverse*/
4266 0, /*tp_clear*/
4267 0, /*tp_richcompare*/
4268 0, /*tp_weaklistoffset*/
4269 0, /*tp_iter*/
4270 0, /*tp_iternext*/
4271 encoding_map_methods, /*tp_methods*/
4272 0, /*tp_members*/
4273 0, /*tp_getset*/
4274 0, /*tp_base*/
4275 0, /*tp_dict*/
4276 0, /*tp_descr_get*/
4277 0, /*tp_descr_set*/
4278 0, /*tp_dictoffset*/
4279 0, /*tp_init*/
4280 0, /*tp_alloc*/
4281 0, /*tp_new*/
4282 0, /*tp_free*/
4283 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004284};
4285
4286PyObject*
4287PyUnicode_BuildEncodingMap(PyObject* string)
4288{
4289 Py_UNICODE *decode;
4290 PyObject *result;
4291 struct encoding_map *mresult;
4292 int i;
4293 int need_dict = 0;
4294 unsigned char level1[32];
4295 unsigned char level2[512];
4296 unsigned char *mlevel1, *mlevel2, *mlevel3;
4297 int count2 = 0, count3 = 0;
4298
4299 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4300 PyErr_BadArgument();
4301 return NULL;
4302 }
4303 decode = PyUnicode_AS_UNICODE(string);
4304 memset(level1, 0xFF, sizeof level1);
4305 memset(level2, 0xFF, sizeof level2);
4306
4307 /* If there isn't a one-to-one mapping of NULL to \0,
4308 or if there are non-BMP characters, we need to use
4309 a mapping dictionary. */
4310 if (decode[0] != 0)
4311 need_dict = 1;
4312 for (i = 1; i < 256; i++) {
4313 int l1, l2;
4314 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004315#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004316 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004317#endif
4318 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 need_dict = 1;
4320 break;
4321 }
4322 if (decode[i] == 0xFFFE)
4323 /* unmapped character */
4324 continue;
4325 l1 = decode[i] >> 11;
4326 l2 = decode[i] >> 7;
4327 if (level1[l1] == 0xFF)
4328 level1[l1] = count2++;
4329 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004330 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004331 }
4332
4333 if (count2 >= 0xFF || count3 >= 0xFF)
4334 need_dict = 1;
4335
4336 if (need_dict) {
4337 PyObject *result = PyDict_New();
4338 PyObject *key, *value;
4339 if (!result)
4340 return NULL;
4341 for (i = 0; i < 256; i++) {
4342 key = value = NULL;
4343 key = PyInt_FromLong(decode[i]);
4344 value = PyInt_FromLong(i);
4345 if (!key || !value)
4346 goto failed1;
4347 if (PyDict_SetItem(result, key, value) == -1)
4348 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004349 Py_DECREF(key);
4350 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004351 }
4352 return result;
4353 failed1:
4354 Py_XDECREF(key);
4355 Py_XDECREF(value);
4356 Py_DECREF(result);
4357 return NULL;
4358 }
4359
4360 /* Create a three-level trie */
4361 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4362 16*count2 + 128*count3 - 1);
4363 if (!result)
4364 return PyErr_NoMemory();
4365 PyObject_Init(result, &EncodingMapType);
4366 mresult = (struct encoding_map*)result;
4367 mresult->count2 = count2;
4368 mresult->count3 = count3;
4369 mlevel1 = mresult->level1;
4370 mlevel2 = mresult->level23;
4371 mlevel3 = mresult->level23 + 16*count2;
4372 memcpy(mlevel1, level1, 32);
4373 memset(mlevel2, 0xFF, 16*count2);
4374 memset(mlevel3, 0, 128*count3);
4375 count3 = 0;
4376 for (i = 1; i < 256; i++) {
4377 int o1, o2, o3, i2, i3;
4378 if (decode[i] == 0xFFFE)
4379 /* unmapped character */
4380 continue;
4381 o1 = decode[i]>>11;
4382 o2 = (decode[i]>>7) & 0xF;
4383 i2 = 16*mlevel1[o1] + o2;
4384 if (mlevel2[i2] == 0xFF)
4385 mlevel2[i2] = count3++;
4386 o3 = decode[i] & 0x7F;
4387 i3 = 128*mlevel2[i2] + o3;
4388 mlevel3[i3] = i;
4389 }
4390 return result;
4391}
4392
4393static int
4394encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4395{
4396 struct encoding_map *map = (struct encoding_map*)mapping;
4397 int l1 = c>>11;
4398 int l2 = (c>>7) & 0xF;
4399 int l3 = c & 0x7F;
4400 int i;
4401
4402#ifdef Py_UNICODE_WIDE
4403 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004404 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 }
4406#endif
4407 if (c == 0)
4408 return 0;
4409 /* level 1*/
4410 i = map->level1[l1];
4411 if (i == 0xFF) {
4412 return -1;
4413 }
4414 /* level 2*/
4415 i = map->level23[16*i+l2];
4416 if (i == 0xFF) {
4417 return -1;
4418 }
4419 /* level 3 */
4420 i = map->level23[16*map->count2 + 128*i + l3];
4421 if (i == 0) {
4422 return -1;
4423 }
4424 return i;
4425}
4426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427/* Lookup the character ch in the mapping. If the character
4428 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004429 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 PyObject *w = PyInt_FromLong((long)c);
4433 PyObject *x;
4434
4435 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 x = PyObject_GetItem(mapping, w);
4438 Py_DECREF(w);
4439 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004440 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4441 /* No mapping found means: mapping is undefined. */
4442 PyErr_Clear();
4443 x = Py_None;
4444 Py_INCREF(x);
4445 return x;
4446 } else
4447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004449 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004450 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004452 long value = PyInt_AS_LONG(x);
4453 if (value < 0 || value > 255) {
4454 PyErr_SetString(PyExc_TypeError,
4455 "character mapping must be in range(256)");
4456 Py_DECREF(x);
4457 return NULL;
4458 }
4459 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004461 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004464 /* wrong return value */
4465 PyErr_SetString(PyExc_TypeError,
4466 "character mapping must return integer, None or str");
4467 Py_DECREF(x);
4468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
4470}
4471
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472static int
4473charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4474{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004475 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4476 /* exponentially overallocate to minimize reallocations */
4477 if (requiredsize < 2*outsize)
4478 requiredsize = 2*outsize;
4479 if (_PyString_Resize(outobj, requiredsize)) {
4480 return 0;
4481 }
4482 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004483}
4484
/* Outcome of trying to encode one character (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or reallocation failed */
} charmapencode_result;
/* Look up the character, put the result in the output string and adjust
   various state variables.  Reallocate the output string if not enough
   space is available.  Returns enc_SUCCESS if the character was written,
   enc_FAILED if the mapping is undefined for it (in which case no
   character was written), or enc_EXCEPTION if the lookup or a
   reallocation failed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004495charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004496 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498 PyObject *rep;
4499 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004500 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501
Christian Heimese93237d2007-12-19 02:37:44 +00004502 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004503 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004504 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 if (res == -1)
4506 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004507 if (outsize<requiredsize)
4508 if (!charmapencode_resize(outobj, outpos, requiredsize))
4509 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004510 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004511 outstart[(*outpos)++] = (char)res;
4512 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004513 }
4514
4515 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004517 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004518 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004519 Py_DECREF(rep);
4520 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004521 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 if (PyInt_Check(rep)) {
4523 Py_ssize_t requiredsize = *outpos+1;
4524 if (outsize<requiredsize)
4525 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4526 Py_DECREF(rep);
4527 return enc_EXCEPTION;
4528 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004529 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004530 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004531 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 else {
4533 const char *repchars = PyString_AS_STRING(rep);
4534 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4535 Py_ssize_t requiredsize = *outpos+repsize;
4536 if (outsize<requiredsize)
4537 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4538 Py_DECREF(rep);
4539 return enc_EXCEPTION;
4540 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004541 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004542 memcpy(outstart + *outpos, repchars, repsize);
4543 *outpos += repsize;
4544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 }
Georg Brandl9f167602006-06-04 21:46:16 +00004546 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004547 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548}
4549
4550/* handle an error in PyUnicode_EncodeCharmap
4551 Return 0 on success, -1 on error */
4552static
4553int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004556 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558{
4559 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004560 Py_ssize_t repsize;
4561 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 Py_UNICODE *uni2;
4563 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 Py_ssize_t collstartpos = *inpos;
4565 Py_ssize_t collendpos = *inpos+1;
4566 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 char *encoding = "charmap";
4568 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004569 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 /* find all unencodable characters */
4572 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004573 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004574 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004575 int res = encoding_map_lookup(p[collendpos], mapping);
4576 if (res != -1)
4577 break;
4578 ++collendpos;
4579 continue;
4580 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004581
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004582 rep = charmapencode_lookup(p[collendpos], mapping);
4583 if (rep==NULL)
4584 return -1;
4585 else if (rep!=Py_None) {
4586 Py_DECREF(rep);
4587 break;
4588 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004589 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004590 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 }
4592 /* cache callback name lookup
4593 * (if not done yet, i.e. it's the first error) */
4594 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 if ((errors==NULL) || (!strcmp(errors, "strict")))
4596 *known_errorHandler = 1;
4597 else if (!strcmp(errors, "replace"))
4598 *known_errorHandler = 2;
4599 else if (!strcmp(errors, "ignore"))
4600 *known_errorHandler = 3;
4601 else if (!strcmp(errors, "xmlcharrefreplace"))
4602 *known_errorHandler = 4;
4603 else
4604 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 }
4606 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004607 case 1: /* strict */
4608 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4609 return -1;
4610 case 2: /* replace */
4611 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 x = charmapencode_output('?', mapping, res, respos);
4613 if (x==enc_EXCEPTION) {
4614 return -1;
4615 }
4616 else if (x==enc_FAILED) {
4617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618 return -1;
4619 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004620 }
4621 /* fall through */
4622 case 3: /* ignore */
4623 *inpos = collendpos;
4624 break;
4625 case 4: /* xmlcharrefreplace */
4626 /* generate replacement (temporarily (mis)uses p) */
4627 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004628 char buffer[2+29+1+1];
4629 char *cp;
4630 sprintf(buffer, "&#%d;", (int)p[collpos]);
4631 for (cp = buffer; *cp; ++cp) {
4632 x = charmapencode_output(*cp, mapping, res, respos);
4633 if (x==enc_EXCEPTION)
4634 return -1;
4635 else if (x==enc_FAILED) {
4636 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4637 return -1;
4638 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004639 }
4640 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004641 *inpos = collendpos;
4642 break;
4643 default:
4644 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004645 encoding, reason, p, size, exceptionObject,
4646 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004647 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004648 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004649 /* generate replacement */
4650 repsize = PyUnicode_GET_SIZE(repunicode);
4651 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004652 x = charmapencode_output(*uni2, mapping, res, respos);
4653 if (x==enc_EXCEPTION) {
4654 return -1;
4655 }
4656 else if (x==enc_FAILED) {
4657 Py_DECREF(repunicode);
4658 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4659 return -1;
4660 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004661 }
4662 *inpos = newpos;
4663 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 }
4665 return 0;
4666}
4667
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004669 Py_ssize_t size,
4670 PyObject *mapping,
4671 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 /* output object */
4674 PyObject *res = NULL;
4675 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004678 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 PyObject *errorHandler = NULL;
4680 PyObject *exc = NULL;
4681 /* the following variable is used for caching string comparisons
4682 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4683 * 3=ignore, 4=xmlcharrefreplace */
4684 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686 /* Default to Latin-1 */
4687 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004688 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 /* allocate enough for a simple encoding without
4691 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004692 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 if (res == NULL)
4694 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004695 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004696 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 /* try to encode it */
4700 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4701 if (x==enc_EXCEPTION) /* error */
4702 goto onError;
4703 if (x==enc_FAILED) { /* unencodable character */
4704 if (charmap_encoding_error(p, size, &inpos, mapping,
4705 &exc,
4706 &known_errorHandler, &errorHandler, errors,
4707 &res, &respos)) {
4708 goto onError;
4709 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004710 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 else
4712 /* done with this character => adjust input position */
4713 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004717 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004718 if (_PyString_Resize(&res, respos))
4719 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 }
4721 Py_XDECREF(exc);
4722 Py_XDECREF(errorHandler);
4723 return res;
4724
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 Py_XDECREF(res);
4727 Py_XDECREF(exc);
4728 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 return NULL;
4730}
4731
4732PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004733 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734{
4735 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004736 PyErr_BadArgument();
4737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 }
4739 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 PyUnicode_GET_SIZE(unicode),
4741 mapping,
4742 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743}
4744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745/* create or adjust a UnicodeTranslateError */
4746static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 const Py_UNICODE *unicode, Py_ssize_t size,
4748 Py_ssize_t startpos, Py_ssize_t endpos,
4749 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004752 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 }
4755 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004756 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4757 goto onError;
4758 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4759 goto onError;
4760 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4761 goto onError;
4762 return;
4763 onError:
4764 Py_DECREF(*exceptionObject);
4765 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 }
4767}
4768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769/* raises a UnicodeTranslateError */
4770static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 const Py_UNICODE *unicode, Py_ssize_t size,
4772 Py_ssize_t startpos, Py_ssize_t endpos,
4773 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774{
4775 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004776 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779}
4780
4781/* error handling callback helper:
4782 build arguments, call the callback and check the arguments,
4783 put the result into newpos and return the replacement string, which
4784 has to be freed by the caller */
4785static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004786 PyObject **errorHandler,
4787 const char *reason,
4788 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4789 Py_ssize_t startpos, Py_ssize_t endpos,
4790 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004792 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793
Martin v. Löwis412fb672006-04-13 06:34:32 +00004794 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 PyObject *restuple;
4796 PyObject *resunicode;
4797
4798 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 }
4803
4804 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004805 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808
4809 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004814 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 Py_DECREF(restuple);
4816 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 }
4818 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 &resunicode, &i_newpos)) {
4820 Py_DECREF(restuple);
4821 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004824 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 else
4826 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004827 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4829 Py_DECREF(restuple);
4830 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 Py_INCREF(resunicode);
4833 Py_DECREF(restuple);
4834 return resunicode;
4835}
4836
4837/* Lookup the character ch in the mapping and put the result in result,
4838 which must be decrefed by the caller.
4839 Return 0 on success, -1 on error */
4840static
4841int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4842{
4843 PyObject *w = PyInt_FromLong((long)c);
4844 PyObject *x;
4845
4846 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 x = PyObject_GetItem(mapping, w);
4849 Py_DECREF(w);
4850 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004851 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4852 /* No mapping found means: use 1:1 mapping. */
4853 PyErr_Clear();
4854 *result = NULL;
4855 return 0;
4856 } else
4857 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 }
4859 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004860 *result = x;
4861 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 }
4863 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004864 long value = PyInt_AS_LONG(x);
4865 long max = PyUnicode_GetMax();
4866 if (value < 0 || value > max) {
4867 PyErr_Format(PyExc_TypeError,
4868 "character mapping must be in range(0x%lx)", max+1);
4869 Py_DECREF(x);
4870 return -1;
4871 }
4872 *result = x;
4873 return 0;
4874 }
4875 else if (PyUnicode_Check(x)) {
4876 *result = x;
4877 return 0;
4878 }
4879 else {
4880 /* wrong return value */
4881 PyErr_SetString(PyExc_TypeError,
4882 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004883 Py_DECREF(x);
4884 return -1;
4885 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886}
4887/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004888 if not reallocate and adjust various state variables.
4889 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890static
Walter Dörwald4894c302003-10-24 14:25:28 +00004891int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004892 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004895 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004896 /* remember old output position */
4897 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4898 /* exponentially overallocate to minimize reallocations */
4899 if (requiredsize < 2 * oldsize)
4900 requiredsize = 2 * oldsize;
4901 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4902 return -1;
4903 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 }
4905 return 0;
4906}
4907/* lookup the character, put the result in the output string and adjust
4908 various state variables. Return a new reference to the object that
4909 was put in the output buffer in *result, or Py_None, if the mapping was
4910 undefined (in which case no character was written).
4911 The called must decref result.
4912 Return 0 on success, -1 on error. */
4913static
Walter Dörwald4894c302003-10-24 14:25:28 +00004914int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004915 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4916 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917{
Walter Dörwald4894c302003-10-24 14:25:28 +00004918 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004919 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004921 /* not found => default to 1:1 mapping */
4922 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 }
4924 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 /* no overflow check, because we know that the space is enough */
4928 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 }
4930 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004931 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4932 if (repsize==1) {
4933 /* no overflow check, because we know that the space is enough */
4934 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4935 }
4936 else if (repsize!=0) {
4937 /* more than one character */
4938 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4939 (insize - (curinp-startinp)) +
4940 repsize - 1;
4941 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4942 return -1;
4943 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4944 *outp += repsize;
4945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 }
4947 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 return 0;
4950}
4951
4952PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 Py_ssize_t size,
4954 PyObject *mapping,
4955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 /* output object */
4958 PyObject *res = NULL;
4959 /* pointers to the beginning and end+1 of input */
4960 const Py_UNICODE *startp = p;
4961 const Py_UNICODE *endp = p + size;
4962 /* pointer into the output */
4963 Py_UNICODE *str;
4964 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004965 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 char *reason = "character maps to <undefined>";
4967 PyObject *errorHandler = NULL;
4968 PyObject *exc = NULL;
4969 /* the following variable is used for caching string comparisons
4970 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4971 * 3=ignore, 4=xmlcharrefreplace */
4972 int known_errorHandler = -1;
4973
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004975 PyErr_BadArgument();
4976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978
4979 /* allocate enough for a simple 1:1 translation without
4980 replacements, if we need more, we'll resize */
4981 res = PyUnicode_FromUnicode(NULL, size);
4982 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004983 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004985 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004989 /* try to encode it */
4990 PyObject *x = NULL;
4991 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4992 Py_XDECREF(x);
4993 goto onError;
4994 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004995 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 if (x!=Py_None) /* it worked => adjust input pointer */
4997 ++p;
4998 else { /* untranslatable character */
4999 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5000 Py_ssize_t repsize;
5001 Py_ssize_t newpos;
5002 Py_UNICODE *uni2;
5003 /* startpos for collecting untranslatable chars */
5004 const Py_UNICODE *collstart = p;
5005 const Py_UNICODE *collend = p+1;
5006 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005008 /* find all untranslatable characters */
5009 while (collend < endp) {
5010 if (charmaptranslate_lookup(*collend, mapping, &x))
5011 goto onError;
5012 Py_XDECREF(x);
5013 if (x!=Py_None)
5014 break;
5015 ++collend;
5016 }
5017 /* cache callback name lookup
5018 * (if not done yet, i.e. it's the first error) */
5019 if (known_errorHandler==-1) {
5020 if ((errors==NULL) || (!strcmp(errors, "strict")))
5021 known_errorHandler = 1;
5022 else if (!strcmp(errors, "replace"))
5023 known_errorHandler = 2;
5024 else if (!strcmp(errors, "ignore"))
5025 known_errorHandler = 3;
5026 else if (!strcmp(errors, "xmlcharrefreplace"))
5027 known_errorHandler = 4;
5028 else
5029 known_errorHandler = 0;
5030 }
5031 switch (known_errorHandler) {
5032 case 1: /* strict */
5033 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005034 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005035 case 2: /* replace */
5036 /* No need to check for space, this is a 1:1 replacement */
5037 for (coll = collstart; coll<collend; ++coll)
5038 *str++ = '?';
5039 /* fall through */
5040 case 3: /* ignore */
5041 p = collend;
5042 break;
5043 case 4: /* xmlcharrefreplace */
5044 /* generate replacement (temporarily (mis)uses p) */
5045 for (p = collstart; p < collend; ++p) {
5046 char buffer[2+29+1+1];
5047 char *cp;
5048 sprintf(buffer, "&#%d;", (int)*p);
5049 if (charmaptranslate_makespace(&res, &str,
5050 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5051 goto onError;
5052 for (cp = buffer; *cp; ++cp)
5053 *str++ = *cp;
5054 }
5055 p = collend;
5056 break;
5057 default:
5058 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5059 reason, startp, size, &exc,
5060 collstart-startp, collend-startp, &newpos);
5061 if (repunicode == NULL)
5062 goto onError;
5063 /* generate replacement */
5064 repsize = PyUnicode_GET_SIZE(repunicode);
5065 if (charmaptranslate_makespace(&res, &str,
5066 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5067 Py_DECREF(repunicode);
5068 goto onError;
5069 }
5070 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5071 *str++ = *uni2;
5072 p = startp + newpos;
5073 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005074 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005075 }
5076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 /* Resize if we allocated to much */
5078 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005079 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005080 if (PyUnicode_Resize(&res, respos) < 0)
5081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 }
5083 Py_XDECREF(exc);
5084 Py_XDECREF(errorHandler);
5085 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 Py_XDECREF(res);
5089 Py_XDECREF(exc);
5090 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 return NULL;
5092}
5093
5094PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005095 PyObject *mapping,
5096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097{
5098 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005099
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 str = PyUnicode_FromObject(str);
5101 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005104 PyUnicode_GET_SIZE(str),
5105 mapping,
5106 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 Py_DECREF(str);
5108 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005109
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005110 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 Py_XDECREF(str);
5112 return NULL;
5113}
Tim Petersced69f82003-09-16 20:30:58 +00005114
Guido van Rossum9e896b32000-04-05 20:11:21 +00005115/* --- Decimal Encoder ---------------------------------------------------- */
5116
5117int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005118 Py_ssize_t length,
5119 char *output,
5120 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121{
5122 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 PyObject *errorHandler = NULL;
5124 PyObject *exc = NULL;
5125 const char *encoding = "decimal";
5126 const char *reason = "invalid decimal Unicode string";
5127 /* the following variable is used for caching string comparisons
5128 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5129 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130
5131 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005132 PyErr_BadArgument();
5133 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005134 }
5135
5136 p = s;
5137 end = s + length;
5138 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 register Py_UNICODE ch = *p;
5140 int decimal;
5141 PyObject *repunicode;
5142 Py_ssize_t repsize;
5143 Py_ssize_t newpos;
5144 Py_UNICODE *uni2;
5145 Py_UNICODE *collstart;
5146 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005147
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005148 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005149 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005150 ++p;
5151 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005152 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 decimal = Py_UNICODE_TODECIMAL(ch);
5154 if (decimal >= 0) {
5155 *output++ = '0' + decimal;
5156 ++p;
5157 continue;
5158 }
5159 if (0 < ch && ch < 256) {
5160 *output++ = (char)ch;
5161 ++p;
5162 continue;
5163 }
5164 /* All other characters are considered unencodable */
5165 collstart = p;
5166 collend = p+1;
5167 while (collend < end) {
5168 if ((0 < *collend && *collend < 256) ||
5169 !Py_UNICODE_ISSPACE(*collend) ||
5170 Py_UNICODE_TODECIMAL(*collend))
5171 break;
5172 }
5173 /* cache callback name lookup
5174 * (if not done yet, i.e. it's the first error) */
5175 if (known_errorHandler==-1) {
5176 if ((errors==NULL) || (!strcmp(errors, "strict")))
5177 known_errorHandler = 1;
5178 else if (!strcmp(errors, "replace"))
5179 known_errorHandler = 2;
5180 else if (!strcmp(errors, "ignore"))
5181 known_errorHandler = 3;
5182 else if (!strcmp(errors, "xmlcharrefreplace"))
5183 known_errorHandler = 4;
5184 else
5185 known_errorHandler = 0;
5186 }
5187 switch (known_errorHandler) {
5188 case 1: /* strict */
5189 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5190 goto onError;
5191 case 2: /* replace */
5192 for (p = collstart; p < collend; ++p)
5193 *output++ = '?';
5194 /* fall through */
5195 case 3: /* ignore */
5196 p = collend;
5197 break;
5198 case 4: /* xmlcharrefreplace */
5199 /* generate replacement (temporarily (mis)uses p) */
5200 for (p = collstart; p < collend; ++p)
5201 output += sprintf(output, "&#%d;", (int)*p);
5202 p = collend;
5203 break;
5204 default:
5205 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5206 encoding, reason, s, length, &exc,
5207 collstart-s, collend-s, &newpos);
5208 if (repunicode == NULL)
5209 goto onError;
5210 /* generate replacement */
5211 repsize = PyUnicode_GET_SIZE(repunicode);
5212 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5213 Py_UNICODE ch = *uni2;
5214 if (Py_UNICODE_ISSPACE(ch))
5215 *output++ = ' ';
5216 else {
5217 decimal = Py_UNICODE_TODECIMAL(ch);
5218 if (decimal >= 0)
5219 *output++ = '0' + decimal;
5220 else if (0 < ch && ch < 256)
5221 *output++ = (char)ch;
5222 else {
5223 Py_DECREF(repunicode);
5224 raise_encode_exception(&exc, encoding,
5225 s, length, collstart-s, collend-s, reason);
5226 goto onError;
5227 }
5228 }
5229 }
5230 p = s + newpos;
5231 Py_DECREF(repunicode);
5232 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233 }
5234 /* 0-terminate the output string */
5235 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 Py_XDECREF(exc);
5237 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005238 return 0;
5239
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 Py_XDECREF(exc);
5242 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005243 return -1;
5244}
5245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246/* --- Helpers ------------------------------------------------------------ */
5247
Eric Smitha9f7d622008-02-17 19:46:49 +00005248#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005249#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005250
5251#include "stringlib/count.h"
5252#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005253#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005254#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005255
/* Helper macro to fix up start/end slice values in place.
 *
 * `end` is clamped to `len`; a negative `end` or `start` is taken relative
 * to `len` and clamped to 0 if still negative.  `start` and `end` must be
 * modifiable lvalues.  `len` may be evaluated more than once.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the body of an unbraced `if`).
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005270
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005272 PyObject *substr,
5273 Py_ssize_t start,
5274 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005277 PyUnicodeObject* str_obj;
5278 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005279
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005280 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5281 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005282 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005283 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5284 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005285 Py_DECREF(str_obj);
5286 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 }
Tim Petersced69f82003-09-16 20:30:58 +00005288
Antoine Pitrou64672132010-01-13 07:55:48 +00005289 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5292 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005293 );
5294
5295 Py_DECREF(sub_obj);
5296 Py_DECREF(str_obj);
5297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return result;
5299}
5300
Martin v. Löwis18e16552006-02-15 17:27:45 +00005301Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005302 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005303 Py_ssize_t start,
5304 Py_ssize_t end,
5305 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005307 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005308
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005309 str = PyUnicode_FromObject(str);
5310 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005311 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005312 sub = PyUnicode_FromObject(sub);
5313 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005314 Py_DECREF(str);
5315 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 }
Tim Petersced69f82003-09-16 20:30:58 +00005317
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005318 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005319 result = stringlib_find_slice(
5320 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5321 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5322 start, end
5323 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005324 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005325 result = stringlib_rfind_slice(
5326 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5327 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5328 start, end
5329 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005330
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005331 Py_DECREF(str);
5332 Py_DECREF(sub);
5333
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 return result;
5335}
5336
Tim Petersced69f82003-09-16 20:30:58 +00005337static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005339 PyUnicodeObject *substring,
5340 Py_ssize_t start,
5341 Py_ssize_t end,
5342 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 if (substring->length == 0)
5345 return 1;
5346
Antoine Pitrou64672132010-01-13 07:55:48 +00005347 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 end -= substring->length;
5349 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005357 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
5359
5360 return 0;
5361}
5362
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005373 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 Py_DECREF(str);
5377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
Tim Petersced69f82003-09-16 20:30:58 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 (PyUnicodeObject *)substr,
5382 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5386}
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388/* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5390
Tim Petersced69f82003-09-16 20:30:58 +00005391static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005393 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394{
5395
5396 PyUnicodeObject *u;
5397
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005401
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5403
Tim Peters7a29bd52001-09-12 03:03:31 +00005404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 }
5412 return (PyObject*) u;
5413}
5414
Tim Petersced69f82003-09-16 20:30:58 +00005415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416int fixupper(PyUnicodeObject *self)
5417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 Py_UNICODE *s = self->str;
5420 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005423 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 *s = ch;
5429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 s++;
5431 }
5432
5433 return status;
5434}
5435
Tim Petersced69f82003-09-16 20:30:58 +00005436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437int fixlower(PyUnicodeObject *self)
5438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 Py_UNICODE *s = self->str;
5441 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005444 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 *s = ch;
5450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 s++;
5452 }
5453
5454 return status;
5455}
5456
Tim Petersced69f82003-09-16 20:30:58 +00005457static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458int fixswapcase(PyUnicodeObject *self)
5459{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_UNICODE *s = self->str;
5462 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5471 }
5472 s++;
5473 }
5474
5475 return status;
5476}
5477
Tim Petersced69f82003-09-16 20:30:58 +00005478static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479int fixcapitalize(PyUnicodeObject *self)
5480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005482 Py_UNICODE *s = self->str;
5483 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005484
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005485 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005486 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005487 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5496 }
5497 s++;
5498 }
5499 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500}
5501
5502static
5503int fixtitle(PyUnicodeObject *self)
5504{
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5508
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5515 }
5516 else
5517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005523 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005529
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 }
5537 return 1;
5538}
5539
Tim Peters8ce9f162004-08-27 01:49:32 +00005540PyObject *
5541PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Tim Peters8ce9f162004-08-27 01:49:32 +00005543 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005546 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005547 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005554 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Tim Peters05eba1f2004-08-27 21:32:02 +00005556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005558 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 }
5560
Tim Peters91879ab2004-08-27 22:35:44 +00005561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5567 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005573 }
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5581 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005582 }
5583
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005587 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005588 sep = &blank;
5589 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005591 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005599 }
5600 }
5601
5602 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005603 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005604 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005605 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005608
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005612
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
5620 goto onError;
5621 }
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005626
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005629
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005633 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005638 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005639 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
5650 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005652 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5660 }
5661 Py_DECREF(item);
5662 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005664
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5667 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005668 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005669 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005670
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005671 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005672 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 return (PyObject *)res;
5675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005676 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005677 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005678 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 Py_DECREF(item);
5680 /* fall through */
5681
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005683 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 return NULL;
5687}
5688
Tim Petersced69f82003-09-16 20:30:58 +00005689static
5690PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
5695 PyUnicodeObject *u;
5696
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5701
Tim Peters7a29bd52001-09-12 03:03:31 +00005702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 Py_INCREF(self);
5704 return self;
5705 }
5706
Neal Norwitze7d8be82008-07-31 17:17:14 +00005707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719 }
5720
5721 return u;
5722}
5723
Antoine Pitrou64672132010-01-13 07:55:48 +00005724PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 string = PyUnicode_FromObject(string);
5729 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
Antoine Pitrou64672132010-01-13 07:55:48 +00005732 list = stringlib_splitlines(
5733 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5734 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736 Py_DECREF(string);
5737 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738}
5739
Tim Petersced69f82003-09-16 20:30:58 +00005740static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005742 PyUnicodeObject *substring,
5743 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005746 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005749 return stringlib_split_whitespace(
5750 (PyObject*) self, self->str, self->length, maxcount
5751 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Antoine Pitrou64672132010-01-13 07:55:48 +00005753 return stringlib_split(
5754 (PyObject*) self, self->str, self->length,
5755 substring->str, substring->length,
5756 maxcount
5757 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
Tim Petersced69f82003-09-16 20:30:58 +00005760static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005762 PyUnicodeObject *substring,
5763 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005766 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005769 return stringlib_rsplit_whitespace(
5770 (PyObject*) self, self->str, self->length, maxcount
5771 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772
Antoine Pitrou64672132010-01-13 07:55:48 +00005773 return stringlib_rsplit(
5774 (PyObject*) self, self->str, self->length,
5775 substring->str, substring->length,
5776 maxcount
5777 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778}
5779
5780static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005782 PyUnicodeObject *str1,
5783 PyUnicodeObject *str2,
5784 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785{
5786 PyUnicodeObject *u;
5787
5788 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005790 else if (maxcount == 0 || self->length == 0)
5791 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Fredrik Lundh347ee272006-05-24 16:35:18 +00005793 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005794 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005795 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005796 if (str1->length == 0)
5797 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005798 if (str1->length == 1) {
5799 /* replace characters */
5800 Py_UNICODE u1, u2;
5801 if (!findchar(self->str, self->length, str1->str[0]))
5802 goto nothing;
5803 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5804 if (!u)
5805 return NULL;
5806 Py_UNICODE_COPY(u->str, self->str, self->length);
5807 u1 = str1->str[0];
5808 u2 = str2->str[0];
5809 for (i = 0; i < u->length; i++)
5810 if (u->str[i] == u1) {
5811 if (--maxcount < 0)
5812 break;
5813 u->str[i] = u2;
5814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005816 i = stringlib_find(
5817 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005819 if (i < 0)
5820 goto nothing;
5821 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5822 if (!u)
5823 return NULL;
5824 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005825
5826 /* change everything in-place, starting with this one */
5827 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5828 i += str1->length;
5829
5830 while ( --maxcount > 0) {
5831 i = stringlib_find(self->str+i, self->length-i,
5832 str1->str, str1->length,
5833 i);
5834 if (i == -1)
5835 break;
5836 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5837 i += str1->length;
5838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005841
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005842 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005843 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_UNICODE *p;
5845
5846 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005847 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5848 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005849 if (n == 0)
5850 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005851 /* new_size = self->length + n * (str2->length - str1->length)); */
5852 delta = (str2->length - str1->length);
5853 if (delta == 0) {
5854 new_size = self->length;
5855 } else {
5856 product = n * (str2->length - str1->length);
5857 if ((product / (str2->length - str1->length)) != n) {
5858 PyErr_SetString(PyExc_OverflowError,
5859 "replace string is too long");
5860 return NULL;
5861 }
5862 new_size = self->length + product;
5863 if (new_size < 0) {
5864 PyErr_SetString(PyExc_OverflowError,
5865 "replace string is too long");
5866 return NULL;
5867 }
5868 }
5869 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005870 if (!u)
5871 return NULL;
5872 i = 0;
5873 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005874 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005875 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005876 while (n-- > 0) {
5877 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005878 j = stringlib_find(self->str+i, self->length-i,
5879 str1->str, str1->length,
5880 i);
5881 if (j == -1)
5882 break;
5883 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005884 /* copy unchanged part [i:j] */
5885 Py_UNICODE_COPY(p, self->str+i, j-i);
5886 p += j - i;
5887 }
5888 /* copy substitution string */
5889 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005890 Py_UNICODE_COPY(p, str2->str, str2->length);
5891 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005892 }
5893 i = j + str1->length;
5894 }
5895 if (i < self->length)
5896 /* copy tail [i:] */
5897 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005899 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 while (n > 0) {
5901 Py_UNICODE_COPY(p, str2->str, str2->length);
5902 p += str2->length;
5903 if (--n <= 0)
5904 break;
5905 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005907 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 }
5909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005911
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005912 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005913 /* nothing to replace; return original string (when possible) */
5914 if (PyUnicode_CheckExact(self)) {
5915 Py_INCREF(self);
5916 return (PyObject *) self;
5917 }
5918 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919}
5920
5921/* --- Unicode Object Methods --------------------------------------------- */
5922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005923PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005924 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925\n\
5926Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005927characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
5929static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005930unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return fixup(self, fixtitle);
5933}
5934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005936 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937\n\
5938Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005942unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 return fixup(self, fixcapitalize);
5945}
5946
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), replaces every list entry by
   the result of fixup(entry, fixcapitalize) and finally joins the list
   with PyUnicode_Join(NULL, list).

   Returns the joined object, or NULL if the split or any fixup() call
   returns NULL.  The temporary list is always released. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old entry before the slot
           is overwritten with the new one */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5984
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005985/* Argument converter. Coerces to a single unicode character */
5986
5987static int
5988convert_uc(PyObject *obj, void *addr)
5989{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005990 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5991 PyObject *uniobj;
5992 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005993
Benjamin Peterson857ce152009-01-31 16:29:18 +00005994 uniobj = PyUnicode_FromObject(obj);
5995 if (uniobj == NULL) {
5996 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005997 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00005998 return 0;
5999 }
6000 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6001 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006002 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006003 Py_DECREF(uniobj);
6004 return 0;
6005 }
6006 unistr = PyUnicode_AS_UNICODE(uniobj);
6007 *fillcharloc = unistr[0];
6008 Py_DECREF(uniobj);
6009 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006010}
6011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006012PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006013 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006015Return S centered in a Unicode string of length width. Padding is\n\
6016done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
6018static PyObject *
6019unicode_center(PyUnicodeObject *self, PyObject *args)
6020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006021 Py_ssize_t marg, left;
6022 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006023 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
Thomas Woutersde017742006-02-16 19:34:37 +00006025 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 return NULL;
6027
Tim Peters7a29bd52001-09-12 03:03:31 +00006028 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 Py_INCREF(self);
6030 return (PyObject*) self;
6031 }
6032
6033 marg = width - self->length;
6034 left = marg / 2 + (marg & width & 1);
6035
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006036 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037}
6038
Marc-André Lemburge5034372000-08-08 08:04:29 +00006039#if 0
6040
6041/* This code should go into some future Unicode collation support
6042 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006043 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006044
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006045/* speedy UTF-16 code point order comparison */
6046/* gleaned from: */
6047/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6048
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006049static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006050{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006051 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006052 0, 0, 0, 0, 0, 0, 0, 0,
6053 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006054 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006055};
6056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057static int
6058unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006060 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 Py_UNICODE *s1 = str1->str;
6063 Py_UNICODE *s2 = str2->str;
6064
6065 len1 = str1->length;
6066 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006069 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006070
6071 c1 = *s1++;
6072 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006073
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006074 if (c1 > (1<<11) * 26)
6075 c1 += utf16Fixup[c1>>11];
6076 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006079
6080 if (c1 != c2)
6081 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006082
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 }
6085
6086 return (len1 < len2) ? -1 : (len1 != len2);
6087}
6088
Marc-André Lemburge5034372000-08-08 08:04:29 +00006089#else
6090
6091static int
6092unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6093{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006094 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006095
6096 Py_UNICODE *s1 = str1->str;
6097 Py_UNICODE *s2 = str2->str;
6098
6099 len1 = str1->length;
6100 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Marc-André Lemburge5034372000-08-08 08:04:29 +00006102 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006103 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006104
Fredrik Lundh45714e92001-06-26 16:39:36 +00006105 c1 = *s1++;
6106 c2 = *s2++;
6107
6108 if (c1 != c2)
6109 return (c1 < c2) ? -1 : 1;
6110
Marc-André Lemburge5034372000-08-08 08:04:29 +00006111 len1--; len2--;
6112 }
6113
6114 return (len1 < len2) ? -1 : (len1 != len2);
6115}
6116
6117#endif
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006120 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
6122 PyUnicodeObject *u = NULL, *v = NULL;
6123 int result;
6124
6125 /* Coerce the two arguments */
6126 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6127 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6130 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Thomas Wouters7e474022000-07-16 12:04:32 +00006133 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006135 Py_DECREF(u);
6136 Py_DECREF(v);
6137 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
6139
6140 result = unicode_compare(u, v);
6141
6142 Py_DECREF(u);
6143 Py_DECREF(v);
6144 return result;
6145
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 Py_XDECREF(u);
6148 Py_XDECREF(v);
6149 return -1;
6150}
6151
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006152PyObject *PyUnicode_RichCompare(PyObject *left,
6153 PyObject *right,
6154 int op)
6155{
6156 int result;
6157
6158 result = PyUnicode_Compare(left, right);
6159 if (result == -1 && PyErr_Occurred())
6160 goto onError;
6161
6162 /* Convert the return value to a Boolean */
6163 switch (op) {
6164 case Py_EQ:
6165 result = (result == 0);
6166 break;
6167 case Py_NE:
6168 result = (result != 0);
6169 break;
6170 case Py_LE:
6171 result = (result <= 0);
6172 break;
6173 case Py_GE:
6174 result = (result >= 0);
6175 break;
6176 case Py_LT:
6177 result = (result == -1);
6178 break;
6179 case Py_GT:
6180 result = (result == 1);
6181 break;
6182 }
6183 return PyBool_FromLong(result);
6184
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006185 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006186
6187 /* Standard case
6188
6189 Type errors mean that PyUnicode_FromObject() could not convert
6190 one of the arguments (usually the right hand side) to Unicode,
6191 ie. we can't handle the comparison request. However, it is
6192 possible that the other object knows a comparison method, which
6193 is why we return Py_NotImplemented to give the other object a
6194 chance.
6195
6196 */
6197 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6198 PyErr_Clear();
6199 Py_INCREF(Py_NotImplemented);
6200 return Py_NotImplemented;
6201 }
6202 if (op != Py_EQ && op != Py_NE)
6203 return NULL;
6204
6205 /* Equality comparison.
6206
6207 This is a special case: we silence any PyExc_UnicodeDecodeError
6208 and instead turn it into a PyErr_UnicodeWarning.
6209
6210 */
6211 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6212 return NULL;
6213 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006214 if (PyErr_Warn(PyExc_UnicodeWarning,
6215 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006216 "Unicode equal comparison "
6217 "failed to convert both arguments to Unicode - "
6218 "interpreting them as being unequal" :
6219 "Unicode unequal comparison "
6220 "failed to convert both arguments to Unicode - "
6221 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006222 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006223 return NULL;
6224 result = (op == Py_NE);
6225 return PyBool_FromLong(result);
6226}
6227
Guido van Rossum403d68b2000-03-13 15:55:09 +00006228int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006229 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006230{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006231 PyObject *str, *sub;
6232 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006233
6234 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006235 sub = PyUnicode_FromObject(element);
6236 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006237 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006238 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006239
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006240 str = PyUnicode_FromObject(container);
6241 if (!str) {
6242 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006243 return -1;
6244 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006245
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006246 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006247
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006248 Py_DECREF(str);
6249 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006250
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006251 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252}
6253
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254/* Concat to string or Unicode object giving a new Unicode object. */
6255
6256PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006257 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258{
6259 PyUnicodeObject *u = NULL, *v = NULL, *w;
6260
6261 /* Coerce the two arguments */
6262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006264 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
6269 /* Shortcuts */
6270 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006271 Py_DECREF(v);
6272 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 }
6274 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006275 Py_DECREF(u);
6276 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
6278
6279 /* Concat the two Unicode strings */
6280 w = _PyUnicode_New(u->length + v->length);
6281 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 Py_UNICODE_COPY(w->str, u->str, u->length);
6284 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6285
6286 Py_DECREF(u);
6287 Py_DECREF(v);
6288 return (PyObject *)w;
6289
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 Py_XDECREF(u);
6292 Py_XDECREF(v);
6293 return NULL;
6294}
6295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006299Return the number of non-overlapping occurrences of substring sub in\n\
6300Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
6303static PyObject *
6304unicode_count(PyUnicodeObject *self, PyObject *args)
6305{
6306 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006308 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 PyObject *result;
6310
Guido van Rossumb8872e62000-05-09 14:14:27 +00006311 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006312 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
6314
6315 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006316 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006318 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006319
Antoine Pitrou64672132010-01-13 07:55:48 +00006320 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006321 result = PyInt_FromSsize_t(
6322 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006323 substring->str, substring->length,
6324 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006325 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006328
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 return result;
6330}
6331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006332PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006333 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006335Encodes S using the codec registered for encoding. encoding defaults\n\
6336to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006337handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6339'xmlcharrefreplace' as well as any other name registered with\n\
6340codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341
6342static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006343unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006345 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 char *encoding = NULL;
6347 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006348 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006349
Benjamin Peterson332d7212009-09-18 21:14:55 +00006350 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6351 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006353 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006354 if (v == NULL)
6355 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006356 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006357 PyErr_Format(PyExc_TypeError,
6358 "encoder did not return a string/unicode object "
6359 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006360 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006361 Py_DECREF(v);
6362 return NULL;
6363 }
6364 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006366 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006367 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006368}
6369
6370PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006371 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006372\n\
6373Decodes S using the codec registered for encoding. encoding defaults\n\
6374to the default encoding. errors may be given to set a different error\n\
6375handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6376a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6377as well as any other name registerd with codecs.register_error that is\n\
6378able to handle UnicodeDecodeErrors.");
6379
6380static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006381unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006382{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006383 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384 char *encoding = NULL;
6385 char *errors = NULL;
6386 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006387
Benjamin Peterson332d7212009-09-18 21:14:55 +00006388 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6389 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006390 return NULL;
6391 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006392 if (v == NULL)
6393 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006394 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006395 PyErr_Format(PyExc_TypeError,
6396 "decoder did not return a string/unicode object "
6397 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006398 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006399 Py_DECREF(v);
6400 return NULL;
6401 }
6402 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006403
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006404 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006409 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410\n\
6411Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
6414static PyObject*
6415unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6416{
6417 Py_UNICODE *e;
6418 Py_UNICODE *p;
6419 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006420 Py_UNICODE *qe;
6421 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 PyUnicodeObject *u;
6423 int tabsize = 8;
6424
6425 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
Thomas Wouters7e474022000-07-16 12:04:32 +00006428 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006429 i = 0; /* chars up to and including most recent \n or \r */
6430 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6431 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 for (p = self->str; p < e; p++)
6433 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006434 if (tabsize > 0) {
6435 incr = tabsize - (j % tabsize); /* cannot overflow */
6436 if (j > PY_SSIZE_T_MAX - incr)
6437 goto overflow1;
6438 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006439 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006442 if (j > PY_SSIZE_T_MAX - 1)
6443 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 j++;
6445 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006446 if (i > PY_SSIZE_T_MAX - j)
6447 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006449 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
6451 }
6452
Guido van Rossum5bdff602008-03-11 21:18:06 +00006453 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 /* Second pass: create output string and fill it */
6457 u = _PyUnicode_New(i + j);
6458 if (!u)
6459 return NULL;
6460
Guido van Rossum5bdff602008-03-11 21:18:06 +00006461 j = 0; /* same as in first pass */
6462 q = u->str; /* next output char */
6463 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465 for (p = self->str; p < e; p++)
6466 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006467 if (tabsize > 0) {
6468 i = tabsize - (j % tabsize);
6469 j += i;
6470 while (i--) {
6471 if (q >= qe)
6472 goto overflow2;
6473 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006474 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006475 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006476 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006477 else {
6478 if (q >= qe)
6479 goto overflow2;
6480 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006481 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (*p == '\n' || *p == '\r')
6483 j = 0;
6484 }
6485
6486 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006487
6488 overflow2:
6489 Py_DECREF(u);
6490 overflow1:
6491 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006495PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497\n\
6498Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006499such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500arguments start and end are interpreted as in slice notation.\n\
6501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
6504static PyObject *
6505unicode_find(PyUnicodeObject *self, PyObject *args)
6506{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006507 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006508 Py_ssize_t start;
6509 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006510 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
Facundo Batista57d56692007-11-16 18:04:14 +00006512 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006515 result = stringlib_find_slice(
6516 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6517 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6518 start, end
6519 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
6521 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006522
6523 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
6526static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
6529 if (index < 0 || index >= self->length) {
6530 PyErr_SetString(PyExc_IndexError, "string index out of range");
6531 return NULL;
6532 }
6533
6534 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6535}
6536
6537static long
6538unicode_hash(PyUnicodeObject *self)
6539{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006540 /* Since Unicode objects compare equal to their ASCII string
6541 counterparts, they should use the individual character values
6542 as basis for their hash value. This is needed to assure that
6543 strings and Unicode objects behave in the same way as
6544 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006547 register Py_UNICODE *p;
6548 register long x;
6549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006551 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006552 len = PyUnicode_GET_SIZE(self);
6553 p = PyUnicode_AS_UNICODE(self);
6554 x = *p << 7;
6555 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006556 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006557 x ^= PyUnicode_GET_SIZE(self);
6558 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006559 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006560 self->hash = x;
6561 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569static PyObject *
6570unicode_index(PyUnicodeObject *self, PyObject *args)
6571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006573 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006574 Py_ssize_t start;
6575 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Facundo Batista57d56692007-11-16 18:04:14 +00006577 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006580 result = stringlib_find_slice(
6581 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6582 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6583 start, end
6584 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006587
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 if (result < 0) {
6589 PyErr_SetString(PyExc_ValueError, "substring not found");
6590 return NULL;
6591 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006592
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006597 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006599Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
6605 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6606 register const Py_UNICODE *e;
6607 int cased;
6608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 /* Shortcut for single character strings */
6610 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006611 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006613 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006614 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006615 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 e = p + PyUnicode_GET_SIZE(self);
6618 cased = 0;
6619 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006620 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006621
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006622 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6623 return PyBool_FromLong(0);
6624 else if (!cased && Py_UNICODE_ISLOWER(ch))
6625 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006627 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628}
6629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006630PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006631 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006633Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
6636static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006637unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6640 register const Py_UNICODE *e;
6641 int cased;
6642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 /* Shortcut for single character strings */
6644 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006645 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006647 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006648 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 e = p + PyUnicode_GET_SIZE(self);
6652 cased = 0;
6653 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006654 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6657 return PyBool_FromLong(0);
6658 else if (!cased && Py_UNICODE_ISUPPER(ch))
6659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006667Return True if S is a titlecased string and there is at least one\n\
6668character in S, i.e. upper- and titlecase characters may only\n\
6669follow uncased characters and lowercase characters only cased ones.\n\
6670Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006673unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
6675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6676 register const Py_UNICODE *e;
6677 int cased, previous_is_cased;
6678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* Shortcut for single character strings */
6680 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006681 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6682 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006684 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006685 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006686 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 e = p + PyUnicode_GET_SIZE(self);
6689 cased = 0;
6690 previous_is_cased = 0;
6691 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006693
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006694 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6695 if (previous_is_cased)
6696 return PyBool_FromLong(0);
6697 previous_is_cased = 1;
6698 cased = 1;
6699 }
6700 else if (Py_UNICODE_ISLOWER(ch)) {
6701 if (!previous_is_cased)
6702 return PyBool_FromLong(0);
6703 previous_is_cased = 1;
6704 cased = 1;
6705 }
6706 else
6707 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006713 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006715Return True if all characters in S are whitespace\n\
6716and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6722 register const Py_UNICODE *e;
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 /* Shortcut for single character strings */
6725 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006726 Py_UNICODE_ISSPACE(*p))
6727 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006730 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 e = p + PyUnicode_GET_SIZE(self);
6734 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006735 if (!Py_UNICODE_ISSPACE(*p))
6736 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006742 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006744Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006746
6747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006748unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006749{
6750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6751 register const Py_UNICODE *e;
6752
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006753 /* Shortcut for single character strings */
6754 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 Py_UNICODE_ISALPHA(*p))
6756 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006757
6758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006760 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006761
6762 e = p + PyUnicode_GET_SIZE(self);
6763 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006764 if (!Py_UNICODE_ISALPHA(*p))
6765 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006768}
6769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006770PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006771 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006773Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775
6776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006777unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778{
6779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6780 register const Py_UNICODE *e;
6781
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006784 Py_UNICODE_ISALNUM(*p))
6785 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786
6787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006788 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790
6791 e = p + PyUnicode_GET_SIZE(self);
6792 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006793 if (!Py_UNICODE_ISALNUM(*p))
6794 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006813 Py_UNICODE_ISDECIMAL(*p))
6814 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006817 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 e = p + PyUnicode_GET_SIZE(self);
6821 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006822 if (!Py_UNICODE_ISDECIMAL(*p))
6823 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006831Return True if all characters in S are digits\n\
6832and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006835unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
6837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6838 register const Py_UNICODE *e;
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006842 Py_UNICODE_ISDIGIT(*p))
6843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 e = p + PyUnicode_GET_SIZE(self);
6850 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006851 if (!Py_UNICODE_ISDIGIT(*p))
6852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 Py_UNICODE_ISNUMERIC(*p))
6872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006880 if (!Py_UNICODE_ISNUMERIC(*p))
6881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006887 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888\n\
6889Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006890iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006895 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899unicode_length(PyUnicodeObject *self)
6900{
6901 return self->length;
6902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006907Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006908done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
6910static PyObject *
6911unicode_ljust(PyUnicodeObject *self, PyObject *args)
6912{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006913 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006914 Py_UNICODE fillchar = ' ';
6915
Martin v. Löwis412fb672006-04-13 06:34:32 +00006916 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 return NULL;
6918
Tim Peters7a29bd52001-09-12 03:03:31 +00006919 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 Py_INCREF(self);
6921 return (PyObject*) self;
6922 }
6923
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006924 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006927PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006928 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006933unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 return fixup(self, fixlower);
6936}
6937
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
6946
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006947/* externally visible for str.strip(unicode) */
6948PyObject *
6949_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6950{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006951 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6952 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6953 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6954 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6955 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006956
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006958
Benjamin Peterson857ce152009-01-31 16:29:18 +00006959 i = 0;
6960 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6962 i++;
6963 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006964 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006965
Benjamin Peterson857ce152009-01-31 16:29:18 +00006966 j = len;
6967 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 do {
6969 j--;
6970 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6971 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006972 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006973
Benjamin Peterson857ce152009-01-31 16:29:18 +00006974 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006975 Py_INCREF(self);
6976 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006977 }
6978 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006979 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006980}
6981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006984do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006986 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6987 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006988
Benjamin Peterson857ce152009-01-31 16:29:18 +00006989 i = 0;
6990 if (striptype != RIGHTSTRIP) {
6991 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6992 i++;
6993 }
6994 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006995
Benjamin Peterson857ce152009-01-31 16:29:18 +00006996 j = len;
6997 if (striptype != LEFTSTRIP) {
6998 do {
6999 j--;
7000 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7001 j++;
7002 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007003
Benjamin Peterson857ce152009-01-31 16:29:18 +00007004 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7005 Py_INCREF(self);
7006 return (PyObject*)self;
7007 }
7008 else
7009 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010}
7011
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007012
7013static PyObject *
7014do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007016 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017
Benjamin Peterson857ce152009-01-31 16:29:18 +00007018 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7019 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020
Benjamin Peterson857ce152009-01-31 16:29:18 +00007021 if (sep != NULL && sep != Py_None) {
7022 if (PyUnicode_Check(sep))
7023 return _PyUnicode_XStrip(self, striptype, sep);
7024 else if (PyString_Check(sep)) {
7025 PyObject *res;
7026 sep = PyUnicode_FromObject(sep);
7027 if (sep==NULL)
7028 return NULL;
7029 res = _PyUnicode_XStrip(self, striptype, sep);
7030 Py_DECREF(sep);
7031 return res;
7032 }
7033 else {
7034 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007035 "%s arg must be None, unicode or str",
7036 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007037 return NULL;
7038 }
7039 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040
Benjamin Peterson857ce152009-01-31 16:29:18 +00007041 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042}
7043
7044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007045PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007046 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047\n\
7048Return a copy of the string S with leading and trailing\n\
7049whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007050If chars is given and not None, remove characters in chars instead.\n\
7051If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052
7053static PyObject *
7054unicode_strip(PyUnicodeObject *self, PyObject *args)
7055{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007056 if (PyTuple_GET_SIZE(args) == 0)
7057 return do_strip(self, BOTHSTRIP); /* Common case */
7058 else
7059 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060}
7061
7062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007064 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065\n\
7066Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007067If chars is given and not None, remove characters in chars instead.\n\
7068If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
7070static PyObject *
7071unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 if (PyTuple_GET_SIZE(args) == 0)
7074 return do_strip(self, LEFTSTRIP); /* Common case */
7075 else
7076 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077}
7078
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082\n\
7083Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007084If chars is given and not None, remove characters in chars instead.\n\
7085If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
7087static PyObject *
7088unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 if (PyTuple_GET_SIZE(args) == 0)
7091 return do_strip(self, RIGHTSTRIP); /* Common case */
7092 else
7093 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094}
7095
7096
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007098unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
7100 PyUnicodeObject *u;
7101 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007102 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007103 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105 if (len < 0)
7106 len = 0;
7107
Tim Peters7a29bd52001-09-12 03:03:31 +00007108 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 /* no repeat, return original string */
7110 Py_INCREF(str);
7111 return (PyObject*) str;
7112 }
Tim Peters8f422462000-09-09 06:13:41 +00007113
7114 /* ensure # of chars needed doesn't overflow int and # of bytes
7115 * needed doesn't overflow size_t
7116 */
7117 nchars = len * str->length;
7118 if (len && nchars / len != str->length) {
7119 PyErr_SetString(PyExc_OverflowError,
7120 "repeated string is too long");
7121 return NULL;
7122 }
7123 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7124 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7125 PyErr_SetString(PyExc_OverflowError,
7126 "repeated string is too long");
7127 return NULL;
7128 }
7129 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 if (!u)
7131 return NULL;
7132
7133 p = u->str;
7134
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007135 if (str->length == 1 && len > 0) {
7136 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007137 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007138 Py_ssize_t done = 0; /* number of characters copied this far */
7139 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007140 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007141 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007142 }
7143 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007144 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007145 Py_UNICODE_COPY(p+done, p, n);
7146 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007147 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150 return (PyObject*) u;
7151}
7152
7153PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007154 PyObject *subobj,
7155 PyObject *replobj,
7156 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157{
7158 PyObject *self;
7159 PyObject *str1;
7160 PyObject *str2;
7161 PyObject *result;
7162
7163 self = PyUnicode_FromObject(obj);
7164 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 str1 = PyUnicode_FromObject(subobj);
7167 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007168 Py_DECREF(self);
7169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 }
7171 str2 = PyUnicode_FromObject(replobj);
7172 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007173 Py_DECREF(self);
7174 Py_DECREF(str1);
7175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
Tim Petersced69f82003-09-16 20:30:58 +00007177 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007178 (PyUnicodeObject *)str1,
7179 (PyUnicodeObject *)str2,
7180 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 Py_DECREF(self);
7182 Py_DECREF(str1);
7183 Py_DECREF(str2);
7184 return result;
7185}
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007188 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189\n\
7190Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007191old replaced by new. If the optional argument count is\n\
7192given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193
7194static PyObject*
7195unicode_replace(PyUnicodeObject *self, PyObject *args)
7196{
7197 PyUnicodeObject *str1;
7198 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007199 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 PyObject *result;
7201
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 return NULL;
7204 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7205 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007208 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007209 Py_DECREF(str1);
7210 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213 result = replace(self, str1, str2, maxcount);
7214
7215 Py_DECREF(str1);
7216 Py_DECREF(str2);
7217 return result;
7218}
7219
7220static
7221PyObject *unicode_repr(PyObject *unicode)
7222{
7223 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007224 PyUnicode_GET_SIZE(unicode),
7225 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007228PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007229 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230\n\
7231Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007232such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233arguments start and end are interpreted as in slice notation.\n\
7234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject *
7238unicode_rfind(PyUnicodeObject *self, PyObject *args)
7239{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007240 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007241 Py_ssize_t start;
7242 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007243 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Facundo Batista57d56692007-11-16 18:04:14 +00007245 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007248 result = stringlib_rfind_slice(
7249 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7250 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7251 start, end
7252 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
7254 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007255
7256 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257}
7258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007259PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007260 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007262Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264static PyObject *
7265unicode_rindex(PyUnicodeObject *self, PyObject *args)
7266{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007267 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007268 Py_ssize_t start;
7269 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007270 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
Facundo Batista57d56692007-11-16 18:04:14 +00007272 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007275 result = stringlib_rfind_slice(
7276 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7277 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7278 start, end
7279 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
7281 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 if (result < 0) {
7284 PyErr_SetString(PyExc_ValueError, "substring not found");
7285 return NULL;
7286 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007287 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007291 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007293Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007294done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
7296static PyObject *
7297unicode_rjust(PyUnicodeObject *self, PyObject *args)
7298{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007299 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007300 Py_UNICODE fillchar = ' ';
7301
Martin v. Löwis412fb672006-04-13 06:34:32 +00007302 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 return NULL;
7304
Tim Peters7a29bd52001-09-12 03:03:31 +00007305 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 Py_INCREF(self);
7307 return (PyObject*) self;
7308 }
7309
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007310 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311}
7312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007314unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
7316 /* standard clamping */
7317 if (start < 0)
7318 start = 0;
7319 if (end < 0)
7320 end = 0;
7321 if (end > self->length)
7322 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007323 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 /* full slice, return original string */
7325 Py_INCREF(self);
7326 return (PyObject*) self;
7327 }
7328 if (start > end)
7329 start = end;
7330 /* copy slice */
7331 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007332 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
7335PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007336 PyObject *sep,
7337 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338{
7339 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007340
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 s = PyUnicode_FromObject(s);
7342 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007344 if (sep != NULL) {
7345 sep = PyUnicode_FromObject(sep);
7346 if (sep == NULL) {
7347 Py_DECREF(s);
7348 return NULL;
7349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
7351
7352 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7353
7354 Py_DECREF(s);
7355 Py_XDECREF(sep);
7356 return result;
7357}
7358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007360 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361\n\
7362Return a list of the words in S, using sep as the\n\
7363delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007364splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007365whitespace string is a separator and empty strings are\n\
7366removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject*
7369unicode_split(PyUnicodeObject *self, PyObject *args)
7370{
7371 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007372 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Martin v. Löwis18e16552006-02-15 17:27:45 +00007374 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 return NULL;
7376
7377 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007378 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007380 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007382 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383}
7384
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007385PyObject *
7386PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7387{
7388 PyObject* str_obj;
7389 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007390 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007391
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007392 str_obj = PyUnicode_FromObject(str_in);
7393 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007394 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007395 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007396 if (!sep_obj) {
7397 Py_DECREF(str_obj);
7398 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007399 }
7400
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007401 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007402 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7403 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7404 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007405
Fredrik Lundhb9479482006-05-26 17:22:38 +00007406 Py_DECREF(sep_obj);
7407 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007408
7409 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007410}
7411
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007412
7413PyObject *
7414PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7415{
7416 PyObject* str_obj;
7417 PyObject* sep_obj;
7418 PyObject* out;
7419
7420 str_obj = PyUnicode_FromObject(str_in);
7421 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007422 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007423 sep_obj = PyUnicode_FromObject(sep_in);
7424 if (!sep_obj) {
7425 Py_DECREF(str_obj);
7426 return NULL;
7427 }
7428
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007429 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007430 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7431 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7432 );
7433
7434 Py_DECREF(sep_obj);
7435 Py_DECREF(str_obj);
7436
7437 return out;
7438}
7439
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007440PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007441 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007442\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007443Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007445found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446
7447static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007448unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450 return PyUnicode_Partition((PyObject *)self, separator);
7451}
7452
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007453PyDoc_STRVAR(rpartition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007454 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007455\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007456Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007457the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007458separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007459
7460static PyObject*
7461unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7462{
7463 return PyUnicode_RPartition((PyObject *)self, separator);
7464}
7465
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007466PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007467 PyObject *sep,
7468 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007469{
7470 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007471
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007472 s = PyUnicode_FromObject(s);
7473 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007474 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007475 if (sep != NULL) {
7476 sep = PyUnicode_FromObject(sep);
7477 if (sep == NULL) {
7478 Py_DECREF(s);
7479 return NULL;
7480 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007481 }
7482
7483 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7484
7485 Py_DECREF(s);
7486 Py_XDECREF(sep);
7487 return result;
7488}
7489
7490PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007491 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007492\n\
7493Return a list of the words in S, using sep as the\n\
7494delimiter string, starting at the end of the string and\n\
7495working to the front. If maxsplit is given, at most maxsplit\n\
7496splits are done. If sep is not specified, any whitespace string\n\
7497is a separator.");
7498
7499static PyObject*
7500unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7501{
7502 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007504
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007506 return NULL;
7507
7508 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007509 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007510 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007511 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007512 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007513 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007517 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518\n\
7519Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007520Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523static PyObject*
7524unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7525{
Guido van Rossum86662912000-04-11 15:38:46 +00007526 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
Guido van Rossum86662912000-04-11 15:38:46 +00007528 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530
Guido van Rossum86662912000-04-11 15:38:46 +00007531 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532}
7533
7534static
7535PyObject *unicode_str(PyUnicodeObject *self)
7536{
Fred Drakee4315f52000-05-09 19:53:39 +00007537 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538}
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007541 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542\n\
7543Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007544and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
7546static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007547unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 return fixup(self, fixswapcase);
7550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007553 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554\n\
7555Return a copy of the string S, where all characters have been mapped\n\
7556through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007557Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7558Unmapped characters are left untouched. Characters mapped to None\n\
7559are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
7561static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007562unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563{
Tim Petersced69f82003-09-16 20:30:58 +00007564 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007565 self->length,
7566 table,
7567 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568}
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007571 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007573Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
7575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007576unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 return fixup(self, fixupper);
7579}
7580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007581PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007582 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583\n\
Georg Brandl98064072008-09-09 19:26:00 +00007584Pad a numeric string S with zeros on the left, to fill a field\n\
7585of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
7587static PyObject *
7588unicode_zfill(PyUnicodeObject *self, PyObject *args)
7589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007590 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 PyUnicodeObject *u;
7592
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t width;
7594 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return NULL;
7596
7597 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007598 if (PyUnicode_CheckExact(self)) {
7599 Py_INCREF(self);
7600 return (PyObject*) self;
7601 }
7602 else
7603 return PyUnicode_FromUnicode(
7604 PyUnicode_AS_UNICODE(self),
7605 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007606 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 }
7608
7609 fill = width - self->length;
7610
7611 u = pad(self, fill, 0, '0');
7612
Walter Dörwald068325e2002-04-15 13:36:47 +00007613 if (u == NULL)
7614 return NULL;
7615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 if (u->str[fill] == '+' || u->str[fill] == '-') {
7617 /* move sign to beginning of string */
7618 u->str[0] = u->str[fill];
7619 u->str[fill] = '0';
7620 }
7621
7622 return (PyObject*) u;
7623}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
#if 0
/* Compiled out.  Returns the current value of `numfree` as a Python int;
   the method table describes the matching entry as a debugging aid. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007634 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007636Return True if S starts with the specified prefix, False otherwise.\n\
7637With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007638With optional end, stop comparing S at that position.\n\
7639prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject *
7642unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007643 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644{
Georg Brandl24250812006-06-09 18:45:48 +00007645 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007648 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007649 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
Georg Brandl24250812006-06-09 18:45:48 +00007651 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007652 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7653 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007654 if (PyTuple_Check(subobj)) {
7655 Py_ssize_t i;
7656 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7657 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007658 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007659 if (substring == NULL)
7660 return NULL;
7661 result = tailmatch(self, substring, start, end, -1);
7662 Py_DECREF(substring);
7663 if (result) {
7664 Py_RETURN_TRUE;
7665 }
7666 }
7667 /* nothing matched */
7668 Py_RETURN_FALSE;
7669 }
7670 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007672 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007673 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007675 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676}
7677
7678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007679PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007682Return True if S ends with the specified suffix, False otherwise.\n\
7683With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007684With optional end, stop comparing S at that position.\n\
7685suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
7687static PyObject *
7688unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007689 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Georg Brandl24250812006-06-09 18:45:48 +00007691 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007694 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007695 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Georg Brandl24250812006-06-09 18:45:48 +00007697 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7699 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007700 if (PyTuple_Check(subobj)) {
7701 Py_ssize_t i;
7702 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7703 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007704 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007705 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007706 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007707 result = tailmatch(self, substring, start, end, +1);
7708 Py_DECREF(substring);
7709 if (result) {
7710 Py_RETURN_TRUE;
7711 }
7712 }
7713 Py_RETURN_FALSE;
7714 }
7715 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Georg Brandl24250812006-06-09 18:45:48 +00007719 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007721 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
7724
Eric Smitha9f7d622008-02-17 19:46:49 +00007725/* Implements do_string_format, which is unicode because of stringlib */
7726#include "stringlib/string_format.h"
7727
7728PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007729 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007730\n\
7731");
7732
Eric Smithdc13b792008-05-30 18:10:04 +00007733static PyObject *
7734unicode__format__(PyObject *self, PyObject *args)
7735{
7736 PyObject *format_spec;
7737 PyObject *result = NULL;
7738 PyObject *tmp = NULL;
7739
7740 /* If 2.x, convert format_spec to the same type as value */
7741 /* This is to allow things like u''.format('') */
7742 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7743 goto done;
7744 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7745 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007746 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007747 goto done;
7748 }
7749 tmp = PyObject_Unicode(format_spec);
7750 if (tmp == NULL)
7751 goto done;
7752 format_spec = tmp;
7753
7754 result = _PyUnicode_FormatAdvanced(self,
7755 PyUnicode_AS_UNICODE(format_spec),
7756 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007757 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007758 Py_XDECREF(tmp);
7759 return result;
7760}
7761
Eric Smitha9f7d622008-02-17 19:46:49 +00007762PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007763 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007764\n\
7765");
7766
Robert Schuppenies901c9972008-06-10 10:10:31 +00007767static PyObject *
7768unicode__sizeof__(PyUnicodeObject *v)
7769{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007770 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7771 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007772}
7773
7774PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007775 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007776\n\
7777");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007778
7779static PyObject *
7780unicode_getnewargs(PyUnicodeObject *v)
7781{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007782 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007783}
7784
7785
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786static PyMethodDef unicode_methods[] = {
7787
7788 /* Order is according to common usage: often used methods should
7789 appear first, since lookup is done sequentially. */
7790
Benjamin Peterson332d7212009-09-18 21:14:55 +00007791 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007792 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7793 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007794 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007795 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7796 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7797 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7798 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7799 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7800 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7801 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007802 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007803 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7804 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7805 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007806 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007807 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007808/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7809 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7810 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7811 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007812 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007813 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007814 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007815 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007816 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7817 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7818 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7819 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7820 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7821 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7822 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7823 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7824 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7825 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7826 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7827 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7828 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7829 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007830 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007831 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7832 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7833 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7834 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007835 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007836#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007837 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838#endif
7839
7840#if 0
7841 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007842 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843#endif
7844
Benjamin Peterson857ce152009-01-31 16:29:18 +00007845 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 {NULL, NULL}
7847};
7848
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007849static PyObject *
7850unicode_mod(PyObject *v, PyObject *w)
7851{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007852 if (!PyUnicode_Check(v)) {
7853 Py_INCREF(Py_NotImplemented);
7854 return Py_NotImplemented;
7855 }
7856 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007857}
7858
7859static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007860 0, /*nb_add*/
7861 0, /*nb_subtract*/
7862 0, /*nb_multiply*/
7863 0, /*nb_divide*/
7864 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007865};
7866
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007868 (lenfunc) unicode_length, /* sq_length */
7869 PyUnicode_Concat, /* sq_concat */
7870 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7871 (ssizeargfunc) unicode_getitem, /* sq_item */
7872 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7873 0, /* sq_ass_item */
7874 0, /* sq_ass_slice */
7875 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876};
7877
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007878static PyObject*
7879unicode_subscript(PyUnicodeObject* self, PyObject* item)
7880{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007881 if (PyIndex_Check(item)) {
7882 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007883 if (i == -1 && PyErr_Occurred())
7884 return NULL;
7885 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007886 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007887 return unicode_getitem(self, i);
7888 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007890 Py_UNICODE* source_buf;
7891 Py_UNICODE* result_buf;
7892 PyObject* result;
7893
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007894 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007895 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007896 return NULL;
7897 }
7898
7899 if (slicelength <= 0) {
7900 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007901 } else if (start == 0 && step == 1 && slicelength == self->length &&
7902 PyUnicode_CheckExact(self)) {
7903 Py_INCREF(self);
7904 return (PyObject *)self;
7905 } else if (step == 1) {
7906 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007907 } else {
7908 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007909 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7910 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007911
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007912 if (result_buf == NULL)
7913 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007914
7915 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7916 result_buf[i] = source_buf[cur];
7917 }
Tim Petersced69f82003-09-16 20:30:58 +00007918
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007919 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007920 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007921 return result;
7922 }
7923 } else {
7924 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7925 return NULL;
7926 }
7927}
7928
7929static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007930 (lenfunc)unicode_length, /* mp_length */
7931 (binaryfunc)unicode_subscript, /* mp_subscript */
7932 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007933};
7934
Martin v. Löwis18e16552006-02-15 17:27:45 +00007935static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007937 Py_ssize_t index,
7938 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939{
7940 if (index != 0) {
7941 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007942 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 return -1;
7944 }
7945 *ptr = (void *) self->str;
7946 return PyUnicode_GET_DATA_SIZE(self);
7947}
7948
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949static Py_ssize_t
7950unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007951 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952{
7953 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007954 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 return -1;
7956}
7957
7958static int
7959unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007960 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961{
7962 if (lenp)
7963 *lenp = PyUnicode_GET_DATA_SIZE(self);
7964 return 1;
7965}
7966
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007967static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007969 Py_ssize_t index,
7970 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
7972 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007973
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 if (index != 0) {
7975 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007976 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 return -1;
7978 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007979 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007981 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007982 *ptr = (void *) PyString_AS_STRING(str);
7983 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984}
7985
7986/* Helpers for PyUnicode_Format() */
7987
7988static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007989getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007993 (*p_argidx)++;
7994 if (arglen < 0)
7995 return args;
7996 else
7997 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 }
7999 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008000 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 return NULL;
8002}
8003
/* Bit flags collected while parsing the flag characters of a % specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008011strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 register Py_ssize_t i;
8014 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008016 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 return len;
8019}
8020
Neal Norwitzfc76d632006-01-10 06:03:13 +00008021static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008022longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8023{
Tim Peters15231542006-02-16 01:08:01 +00008024 Py_ssize_t result;
8025
Neal Norwitzfc76d632006-01-10 06:03:13 +00008026 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008027 result = strtounicode(buffer, (char *)buffer);
8028 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008029}
8030
Guido van Rossum078151d2002-08-11 04:24:12 +00008031/* XXX To save some code duplication, formatfloat/long/int could have been
8032 shared with stringobject.c, converting from 8-bit to Unicode after the
8033 formatting is done. */
8034
Mark Dickinson18cfada2009-11-23 18:46:41 +00008035/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8036
8037static PyObject *
8038formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008040 char *p;
8041 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008043
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 x = PyFloat_AsDouble(v);
8045 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008046 return NULL;
8047
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008049 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008050
Mark Dickinson18cfada2009-11-23 18:46:41 +00008051 p = PyOS_double_to_string(x, type, prec,
8052 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8053 if (p == NULL)
8054 return NULL;
8055 result = PyUnicode_FromStringAndSize(p, strlen(p));
8056 PyMem_Free(p);
8057 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058}
8059
Tim Peters38fd5b62000-09-21 05:43:11 +00008060static PyObject*
8061formatlong(PyObject *val, int flags, int prec, int type)
8062{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008063 char *buf;
8064 int i, len;
8065 PyObject *str; /* temporary string object. */
8066 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008067
Benjamin Peterson857ce152009-01-31 16:29:18 +00008068 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8069 if (!str)
8070 return NULL;
8071 result = _PyUnicode_New(len);
8072 if (!result) {
8073 Py_DECREF(str);
8074 return NULL;
8075 }
8076 for (i = 0; i < len; i++)
8077 result->str[i] = buf[i];
8078 result->str[len] = 0;
8079 Py_DECREF(str);
8080 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008081}
8082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083static int
8084formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008085 size_t buflen,
8086 int flags,
8087 int prec,
8088 int type,
8089 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008091 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008092 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8093 * + 1 + 1
8094 * = 24
8095 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008096 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008097 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 long x;
8099
8100 x = PyInt_AsLong(v);
8101 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008102 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008103 if (x < 0 && type == 'u') {
8104 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008105 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008106 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8107 sign = "-";
8108 else
8109 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008111 prec = 1;
8112
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008113 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8114 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008115 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008116 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008117 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008118 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008119 return -1;
8120 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008121
8122 if ((flags & F_ALT) &&
8123 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008124 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008125 * of issues that cause pain:
8126 * - when 0 is being converted, the C standard leaves off
8127 * the '0x' or '0X', which is inconsistent with other
8128 * %#x/%#X conversions and inconsistent with Python's
8129 * hex() function
8130 * - there are platforms that violate the standard and
8131 * convert 0 with the '0x' or '0X'
8132 * (Metrowerks, Compaq Tru64)
8133 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008134 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008135 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008136 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008137 * We can achieve the desired consistency by inserting our
8138 * own '0x' or '0X' prefix, and substituting %x/%X in place
8139 * of %#x/%#X.
8140 *
8141 * Note that this is the same approach as used in
8142 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008143 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008144 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8145 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008146 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008147 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008148 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8149 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008150 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008151 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008152 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008153 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008155 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156}
8157
8158static int
8159formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008160 size_t buflen,
8161 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008163 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008164 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008165 if (PyUnicode_GET_SIZE(v) != 1)
8166 goto onError;
8167 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008170 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008171 if (PyString_GET_SIZE(v) != 1)
8172 goto onError;
8173 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175
8176 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008177 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008179 x = PyInt_AsLong(v);
8180 if (x == -1 && PyErr_Occurred())
8181 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008182#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008183 if (x < 0 || x > 0x10ffff) {
8184 PyErr_SetString(PyExc_OverflowError,
8185 "%c arg not in range(0x110000) "
8186 "(wide Python build)");
8187 return -1;
8188 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008189#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008190 if (x < 0 || x > 0xffff) {
8191 PyErr_SetString(PyExc_OverflowError,
8192 "%c arg not in range(0x10000) "
8193 "(narrow Python build)");
8194 return -1;
8195 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008196#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008197 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 }
8199 buf[1] = '\0';
8200 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008201
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008202 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008203 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008204 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008205 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206}
8207
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch buffer
   that PyUnicode_Format() hands to formatint() and formatchar().
   XXX This is a magic number.  Each formatting routine does bounds checking
   to ensure no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8217
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008219 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
8221 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 int args_owned = 0;
8224 PyUnicodeObject *result = NULL;
8225 PyObject *dict = NULL;
8226 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008229 PyErr_BadInternalCall();
8230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
8232 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008233 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 fmt = PyUnicode_AS_UNICODE(uformat);
8236 fmtcnt = PyUnicode_GET_SIZE(uformat);
8237
8238 reslen = rescnt = fmtcnt + 100;
8239 result = _PyUnicode_New(reslen);
8240 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 res = PyUnicode_AS_UNICODE(result);
8243
8244 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008245 arglen = PyTuple_Size(args);
8246 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
8248 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008249 arglen = -1;
8250 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 }
Christian Heimese93237d2007-12-19 02:37:44 +00008252 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008253 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008254 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255
8256 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 if (*fmt != '%') {
8258 if (--rescnt < 0) {
8259 rescnt = fmtcnt + 100;
8260 reslen += rescnt;
8261 if (_PyUnicode_Resize(&result, reslen) < 0)
8262 goto onError;
8263 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8264 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008265 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008266 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008267 }
8268 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008269 /* Got a format specifier */
8270 int flags = 0;
8271 Py_ssize_t width = -1;
8272 int prec = -1;
8273 Py_UNICODE c = '\0';
8274 Py_UNICODE fill;
8275 int isnumok;
8276 PyObject *v = NULL;
8277 PyObject *temp = NULL;
8278 Py_UNICODE *pbuf;
8279 Py_UNICODE sign;
8280 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008281 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282
8283 fmt++;
8284 if (*fmt == '(') {
8285 Py_UNICODE *keystart;
8286 Py_ssize_t keylen;
8287 PyObject *key;
8288 int pcount = 1;
8289
8290 if (dict == NULL) {
8291 PyErr_SetString(PyExc_TypeError,
8292 "format requires a mapping");
8293 goto onError;
8294 }
8295 ++fmt;
8296 --fmtcnt;
8297 keystart = fmt;
8298 /* Skip over balanced parentheses */
8299 while (pcount > 0 && --fmtcnt >= 0) {
8300 if (*fmt == ')')
8301 --pcount;
8302 else if (*fmt == '(')
8303 ++pcount;
8304 fmt++;
8305 }
8306 keylen = fmt - keystart - 1;
8307 if (fmtcnt < 0 || pcount > 0) {
8308 PyErr_SetString(PyExc_ValueError,
8309 "incomplete format key");
8310 goto onError;
8311 }
8312#if 0
8313 /* keys are converted to strings using UTF-8 and
8314 then looked up since Python uses strings to hold
8315 variables names etc. in its namespaces and we
8316 wouldn't want to break common idioms. */
8317 key = PyUnicode_EncodeUTF8(keystart,
8318 keylen,
8319 NULL);
8320#else
8321 key = PyUnicode_FromUnicode(keystart, keylen);
8322#endif
8323 if (key == NULL)
8324 goto onError;
8325 if (args_owned) {
8326 Py_DECREF(args);
8327 args_owned = 0;
8328 }
8329 args = PyObject_GetItem(dict, key);
8330 Py_DECREF(key);
8331 if (args == NULL) {
8332 goto onError;
8333 }
8334 args_owned = 1;
8335 arglen = -1;
8336 argidx = -2;
8337 }
8338 while (--fmtcnt >= 0) {
8339 switch (c = *fmt++) {
8340 case '-': flags |= F_LJUST; continue;
8341 case '+': flags |= F_SIGN; continue;
8342 case ' ': flags |= F_BLANK; continue;
8343 case '#': flags |= F_ALT; continue;
8344 case '0': flags |= F_ZERO; continue;
8345 }
8346 break;
8347 }
8348 if (c == '*') {
8349 v = getnextarg(args, arglen, &argidx);
8350 if (v == NULL)
8351 goto onError;
8352 if (!PyInt_Check(v)) {
8353 PyErr_SetString(PyExc_TypeError,
8354 "* wants int");
8355 goto onError;
8356 }
8357 width = PyInt_AsLong(v);
8358 if (width < 0) {
8359 flags |= F_LJUST;
8360 width = -width;
8361 }
8362 if (--fmtcnt >= 0)
8363 c = *fmt++;
8364 }
8365 else if (c >= '0' && c <= '9') {
8366 width = c - '0';
8367 while (--fmtcnt >= 0) {
8368 c = *fmt++;
8369 if (c < '0' || c > '9')
8370 break;
8371 if ((width*10) / 10 != width) {
8372 PyErr_SetString(PyExc_ValueError,
8373 "width too big");
8374 goto onError;
8375 }
8376 width = width*10 + (c - '0');
8377 }
8378 }
8379 if (c == '.') {
8380 prec = 0;
8381 if (--fmtcnt >= 0)
8382 c = *fmt++;
8383 if (c == '*') {
8384 v = getnextarg(args, arglen, &argidx);
8385 if (v == NULL)
8386 goto onError;
8387 if (!PyInt_Check(v)) {
8388 PyErr_SetString(PyExc_TypeError,
8389 "* wants int");
8390 goto onError;
8391 }
8392 prec = PyInt_AsLong(v);
8393 if (prec < 0)
8394 prec = 0;
8395 if (--fmtcnt >= 0)
8396 c = *fmt++;
8397 }
8398 else if (c >= '0' && c <= '9') {
8399 prec = c - '0';
8400 while (--fmtcnt >= 0) {
8401 c = Py_CHARMASK(*fmt++);
8402 if (c < '0' || c > '9')
8403 break;
8404 if ((prec*10) / 10 != prec) {
8405 PyErr_SetString(PyExc_ValueError,
8406 "prec too big");
8407 goto onError;
8408 }
8409 prec = prec*10 + (c - '0');
8410 }
8411 }
8412 } /* prec */
8413 if (fmtcnt >= 0) {
8414 if (c == 'h' || c == 'l' || c == 'L') {
8415 if (--fmtcnt >= 0)
8416 c = *fmt++;
8417 }
8418 }
8419 if (fmtcnt < 0) {
8420 PyErr_SetString(PyExc_ValueError,
8421 "incomplete format");
8422 goto onError;
8423 }
8424 if (c != '%') {
8425 v = getnextarg(args, arglen, &argidx);
8426 if (v == NULL)
8427 goto onError;
8428 }
8429 sign = 0;
8430 fill = ' ';
8431 switch (c) {
8432
8433 case '%':
8434 pbuf = formatbuf;
8435 /* presume that buffer length is at least 1 */
8436 pbuf[0] = '%';
8437 len = 1;
8438 break;
8439
8440 case 's':
8441 case 'r':
8442 if (PyUnicode_Check(v) && c == 's') {
8443 temp = v;
8444 Py_INCREF(temp);
8445 }
8446 else {
8447 PyObject *unicode;
8448 if (c == 's')
8449 temp = PyObject_Unicode(v);
8450 else
8451 temp = PyObject_Repr(v);
8452 if (temp == NULL)
8453 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008454 if (PyUnicode_Check(temp))
8455 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008456 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008457 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008458 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8459 PyString_GET_SIZE(temp),
8460 NULL,
8461 "strict");
8462 Py_DECREF(temp);
8463 temp = unicode;
8464 if (temp == NULL)
8465 goto onError;
8466 }
8467 else {
8468 Py_DECREF(temp);
8469 PyErr_SetString(PyExc_TypeError,
8470 "%s argument has non-string str()");
8471 goto onError;
8472 }
8473 }
8474 pbuf = PyUnicode_AS_UNICODE(temp);
8475 len = PyUnicode_GET_SIZE(temp);
8476 if (prec >= 0 && len > prec)
8477 len = prec;
8478 break;
8479
8480 case 'i':
8481 case 'd':
8482 case 'u':
8483 case 'o':
8484 case 'x':
8485 case 'X':
8486 if (c == 'i')
8487 c = 'd';
8488 isnumok = 0;
8489 if (PyNumber_Check(v)) {
8490 PyObject *iobj=NULL;
8491
8492 if (PyInt_Check(v) || (PyLong_Check(v))) {
8493 iobj = v;
8494 Py_INCREF(iobj);
8495 }
8496 else {
8497 iobj = PyNumber_Int(v);
8498 if (iobj==NULL) iobj = PyNumber_Long(v);
8499 }
8500 if (iobj!=NULL) {
8501 if (PyInt_Check(iobj)) {
8502 isnumok = 1;
8503 pbuf = formatbuf;
8504 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8505 flags, prec, c, iobj);
8506 Py_DECREF(iobj);
8507 if (len < 0)
8508 goto onError;
8509 sign = 1;
8510 }
8511 else if (PyLong_Check(iobj)) {
8512 isnumok = 1;
8513 temp = formatlong(iobj, flags, prec, c);
8514 Py_DECREF(iobj);
8515 if (!temp)
8516 goto onError;
8517 pbuf = PyUnicode_AS_UNICODE(temp);
8518 len = PyUnicode_GET_SIZE(temp);
8519 sign = 1;
8520 }
8521 else {
8522 Py_DECREF(iobj);
8523 }
8524 }
8525 }
8526 if (!isnumok) {
8527 PyErr_Format(PyExc_TypeError,
8528 "%%%c format: a number is required, "
8529 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8530 goto onError;
8531 }
8532 if (flags & F_ZERO)
8533 fill = '0';
8534 break;
8535
8536 case 'e':
8537 case 'E':
8538 case 'f':
8539 case 'F':
8540 case 'g':
8541 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008542 temp = formatfloat(v, flags, prec, c);
8543 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008544 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008545 pbuf = PyUnicode_AS_UNICODE(temp);
8546 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008547 sign = 1;
8548 if (flags & F_ZERO)
8549 fill = '0';
8550 break;
8551
8552 case 'c':
8553 pbuf = formatbuf;
8554 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8555 if (len < 0)
8556 goto onError;
8557 break;
8558
8559 default:
8560 PyErr_Format(PyExc_ValueError,
8561 "unsupported format character '%c' (0x%x) "
8562 "at index %zd",
8563 (31<=c && c<=126) ? (char)c : '?',
8564 (int)c,
8565 (Py_ssize_t)(fmt - 1 -
8566 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008567 goto onError;
8568 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008569 if (sign) {
8570 if (*pbuf == '-' || *pbuf == '+') {
8571 sign = *pbuf++;
8572 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008573 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008574 else if (flags & F_SIGN)
8575 sign = '+';
8576 else if (flags & F_BLANK)
8577 sign = ' ';
8578 else
8579 sign = 0;
8580 }
8581 if (width < len)
8582 width = len;
8583 if (rescnt - (sign != 0) < width) {
8584 reslen -= rescnt;
8585 rescnt = width + fmtcnt + 100;
8586 reslen += rescnt;
8587 if (reslen < 0) {
8588 Py_XDECREF(temp);
8589 PyErr_NoMemory();
8590 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008591 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008592 if (_PyUnicode_Resize(&result, reslen) < 0) {
8593 Py_XDECREF(temp);
8594 goto onError;
8595 }
8596 res = PyUnicode_AS_UNICODE(result)
8597 + reslen - rescnt;
8598 }
8599 if (sign) {
8600 if (fill != ' ')
8601 *res++ = sign;
8602 rescnt--;
8603 if (width > len)
8604 width--;
8605 }
8606 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8607 assert(pbuf[0] == '0');
8608 assert(pbuf[1] == c);
8609 if (fill != ' ') {
8610 *res++ = *pbuf++;
8611 *res++ = *pbuf++;
8612 }
8613 rescnt -= 2;
8614 width -= 2;
8615 if (width < 0)
8616 width = 0;
8617 len -= 2;
8618 }
8619 if (width > len && !(flags & F_LJUST)) {
8620 do {
8621 --rescnt;
8622 *res++ = fill;
8623 } while (--width > len);
8624 }
8625 if (fill == ' ') {
8626 if (sign)
8627 *res++ = sign;
8628 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8629 assert(pbuf[0] == '0');
8630 assert(pbuf[1] == c);
8631 *res++ = *pbuf++;
8632 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008633 }
8634 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008635 Py_UNICODE_COPY(res, pbuf, len);
8636 res += len;
8637 rescnt -= len;
8638 while (--width >= len) {
8639 --rescnt;
8640 *res++ = ' ';
8641 }
8642 if (dict && (argidx < arglen) && c != '%') {
8643 PyErr_SetString(PyExc_TypeError,
8644 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008645 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008646 goto onError;
8647 }
8648 Py_XDECREF(temp);
8649 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 } /* until end */
8651 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008652 PyErr_SetString(PyExc_TypeError,
8653 "not all arguments converted during string formatting");
8654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 }
8656
Thomas Woutersa96affe2006-03-12 00:29:36 +00008657 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008660 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 }
8662 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 return (PyObject *)result;
8664
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008665 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 Py_XDECREF(result);
8667 Py_DECREF(uformat);
8668 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008669 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 }
8671 return NULL;
8672}
8673
8674static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008675 (readbufferproc) unicode_buffer_getreadbuf,
8676 (writebufferproc) unicode_buffer_getwritebuf,
8677 (segcountproc) unicode_buffer_getsegcount,
8678 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679};
8680
Jeremy Hylton938ace62002-07-17 16:30:39 +00008681static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008682unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8683
Tim Peters6d6c1a32001-08-02 04:15:00 +00008684static PyObject *
8685unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8686{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008687 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008688 static char *kwlist[] = {"string", "encoding", "errors", 0};
8689 char *encoding = NULL;
8690 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008691
Benjamin Peterson857ce152009-01-31 16:29:18 +00008692 if (type != &PyUnicode_Type)
8693 return unicode_subtype_new(type, args, kwds);
8694 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008695 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008696 return NULL;
8697 if (x == NULL)
8698 return (PyObject *)_PyUnicode_New(0);
8699 if (encoding == NULL && errors == NULL)
8700 return PyObject_Unicode(x);
8701 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008702 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008703}
8704
Guido van Rossume023fe02001-08-30 03:12:59 +00008705static PyObject *
8706unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8707{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008708 PyUnicodeObject *tmp, *pnew;
8709 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008710
Benjamin Peterson857ce152009-01-31 16:29:18 +00008711 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8712 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8713 if (tmp == NULL)
8714 return NULL;
8715 assert(PyUnicode_Check(tmp));
8716 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8717 if (pnew == NULL) {
8718 Py_DECREF(tmp);
8719 return NULL;
8720 }
8721 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8722 if (pnew->str == NULL) {
8723 _Py_ForgetReference((PyObject *)pnew);
8724 PyObject_Del(pnew);
8725 Py_DECREF(tmp);
8726 return PyErr_NoMemory();
8727 }
8728 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8729 pnew->length = n;
8730 pnew->hash = tmp->hash;
8731 Py_DECREF(tmp);
8732 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008733}
8734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008735PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008736 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008737\n\
8738Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008739encoding defaults to the current default string encoding.\n\
8740errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008741
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008743 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008744 "unicode", /* tp_name */
8745 sizeof(PyUnicodeObject), /* tp_size */
8746 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008748 (destructor)unicode_dealloc, /* tp_dealloc */
8749 0, /* tp_print */
8750 0, /* tp_getattr */
8751 0, /* tp_setattr */
8752 0, /* tp_compare */
8753 unicode_repr, /* tp_repr */
8754 &unicode_as_number, /* tp_as_number */
8755 &unicode_as_sequence, /* tp_as_sequence */
8756 &unicode_as_mapping, /* tp_as_mapping */
8757 (hashfunc) unicode_hash, /* tp_hash*/
8758 0, /* tp_call*/
8759 (reprfunc) unicode_str, /* tp_str */
8760 PyObject_GenericGetAttr, /* tp_getattro */
8761 0, /* tp_setattro */
8762 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008763 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008764 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008765 unicode_doc, /* tp_doc */
8766 0, /* tp_traverse */
8767 0, /* tp_clear */
8768 PyUnicode_RichCompare, /* tp_richcompare */
8769 0, /* tp_weaklistoffset */
8770 0, /* tp_iter */
8771 0, /* tp_iternext */
8772 unicode_methods, /* tp_methods */
8773 0, /* tp_members */
8774 0, /* tp_getset */
8775 &PyBaseString_Type, /* tp_base */
8776 0, /* tp_dict */
8777 0, /* tp_descr_get */
8778 0, /* tp_descr_set */
8779 0, /* tp_dictoffset */
8780 0, /* tp_init */
8781 0, /* tp_alloc */
8782 unicode_new, /* tp_new */
8783 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784};
8785
8786/* Initialize the Unicode implementation */
8787
Thomas Wouters78890102000-07-22 19:25:51 +00008788void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008790 int i;
8791
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008792 /* XXX - move this array to unicodectype.c ? */
8793 Py_UNICODE linebreak[] = {
8794 0x000A, /* LINE FEED */
8795 0x000D, /* CARRIAGE RETURN */
8796 0x001C, /* FILE SEPARATOR */
8797 0x001D, /* GROUP SEPARATOR */
8798 0x001E, /* RECORD SEPARATOR */
8799 0x0085, /* NEXT LINE */
8800 0x2028, /* LINE SEPARATOR */
8801 0x2029, /* PARAGRAPH SEPARATOR */
8802 };
8803
Fred Drakee4315f52000-05-09 19:53:39 +00008804 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008805 free_list = NULL;
8806 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008808 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008809 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008810
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008811 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008812 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008813 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008814 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008815 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008816
8817 /* initialize the linebreak bloom filter */
8818 bloom_linebreak = make_bloom_mask(
8819 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8820 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008821
8822 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823}
8824
8825/* Finalize the Unicode implementation */
8826
Christian Heimes3b718a72008-02-14 12:47:33 +00008827int
8828PyUnicode_ClearFreeList(void)
8829{
8830 int freelist_size = numfree;
8831 PyUnicodeObject *u;
8832
8833 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008834 PyUnicodeObject *v = u;
8835 u = *(PyUnicodeObject **)u;
8836 if (v->str)
8837 PyObject_DEL(v->str);
8838 Py_XDECREF(v->defenc);
8839 PyObject_Del(v);
8840 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008841 }
8842 free_list = NULL;
8843 assert(numfree == 0);
8844 return freelist_size;
8845}
8846
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847void
Thomas Wouters78890102000-07-22 19:25:51 +00008848_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008850 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008852 Py_XDECREF(unicode_empty);
8853 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008854
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008855 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008856 if (unicode_latin1[i]) {
8857 Py_DECREF(unicode_latin1[i]);
8858 unicode_latin1[i] = NULL;
8859 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008860 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008861 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008863
Anthony Baxterac6bd462006-04-13 02:06:09 +00008864#ifdef __cplusplus
8865}
8866#endif