/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: the maximum number of
   deallocated Unicode objects kept around for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (in Py_UNICODE units)
   less than this limit.  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined,
   depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   Indexed by a 7-bit ASCII code; an entry is 1 when that character is
   treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Fast detection of the ASCII line break characters.
   Indexed by a 7-bit ASCII code; an entry is 1 when that character
   terminates a line and 0 otherwise.  The table is only ever read, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Non-zero when chr is in set: cheap bloom pre-filter first, exact scan
   only on a filter hit.  The expansion is fully parenthesized so the
   macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000309 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550#ifdef HAVE_WCHAR_H
551
Mark Dickinson6b265f12009-03-18 16:07:26 +0000552#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
553# define CONVERT_WCHAR_TO_SURROGATES
554#endif
555
556#ifdef CONVERT_WCHAR_TO_SURROGATES
557
558/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
559 to convert from UTF32 to UTF16. */
560
561PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
562 Py_ssize_t size)
563{
564 PyUnicodeObject *unicode;
565 register Py_ssize_t i;
566 Py_ssize_t alloc;
567 const wchar_t *orig_w;
568
569 if (w == NULL) {
570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
574 alloc = size;
575 orig_w = w;
576 for (i = size; i > 0; i--) {
577 if (*w > 0xFFFF)
578 alloc++;
579 w++;
580 }
581 w = orig_w;
582 unicode = _PyUnicode_New(alloc);
583 if (!unicode)
584 return NULL;
585
586 /* Copy the wchar_t data into the new object */
587 {
588 register Py_UNICODE *u;
589 u = PyUnicode_AS_UNICODE(unicode);
590 for (i = size; i > 0; i--) {
591 if (*w > 0xFFFF) {
592 wchar_t ordinal = *w++;
593 ordinal -= 0x10000;
594 *u++ = 0xD800 | (ordinal >> 10);
595 *u++ = 0xDC00 | (ordinal & 0x3FF);
596 }
597 else
598 *u++ = *w++;
599 }
600 }
601 return (PyObject *)unicode;
602}
603
604#else
605
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000607 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
609 PyUnicodeObject *unicode;
610
611 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000612 PyErr_BadInternalCall();
613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 }
615
616 unicode = _PyUnicode_New(size);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621#ifdef HAVE_USABLE_WCHAR_T
622 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000623#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000625 register Py_UNICODE *u;
626 register Py_ssize_t i;
627 u = PyUnicode_AS_UNICODE(unicode);
628 for (i = size; i > 0; i--)
629 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630 }
631#endif
632
633 return (PyObject *)unicode;
634}
635
Mark Dickinson6b265f12009-03-18 16:07:26 +0000636#endif /* CONVERT_WCHAR_TO_SURROGATES */
637
638#undef CONVERT_WCHAR_TO_SURROGATES
639
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640static void
641makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
642{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000643 *fmt++ = '%';
644 if (width) {
645 if (zeropad)
646 *fmt++ = '0';
647 fmt += sprintf(fmt, "%d", width);
648 }
649 if (precision)
650 fmt += sprintf(fmt, ".%d", precision);
651 if (longflag)
652 *fmt++ = 'l';
653 else if (size_tflag) {
654 char *f = PY_FORMAT_SIZE_T;
655 while (*f)
656 *fmt++ = *f++;
657 }
658 *fmt++ = c;
659 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000660}
661
/* Append the NUL-terminated char string `string` to the output cursor.
   Relies on two variables of the enclosing scope: `copy` (a pointer
   used as scratch iterator) and `s` (the output cursor, advanced past
   the characters written).  No terminator is written. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}

664PyObject *
665PyUnicode_FromFormatV(const char *format, va_list vargs)
666{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000667 va_list count;
668 Py_ssize_t callcount = 0;
669 PyObject **callresults = NULL;
670 PyObject **callresult = NULL;
671 Py_ssize_t n = 0;
672 int width = 0;
673 int precision = 0;
674 int zeropad;
675 const char* f;
676 Py_UNICODE *s;
677 PyObject *string;
678 /* used by sprintf */
679 char buffer[21];
680 /* use abuffer instead of buffer, if we need more space
681 * (which can happen if there's a format specifier with width). */
682 char *abuffer = NULL;
683 char *realbuffer;
684 Py_ssize_t abuffersize = 0;
685 char fmt[60]; /* should be enough for %0width.precisionld */
686 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000687
688#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000689 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000690#else
691#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000692 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000693#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000694 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000695#endif
696#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000697 /* step 1: count the number of %S/%R/%s format specifications
698 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
699 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000700 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000701 if (*f == '%') {
702 if (*(f+1)=='%')
703 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000704 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000705 ++callcount;
706 while (isdigit((unsigned)*f))
707 width = (width*10) + *f++ - '0';
708 while (*++f && *f != '%' && !isalpha((unsigned)*f))
709 ;
710 if (*f == 's')
711 ++callcount;
712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000713 }
714 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000715 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000716 if (callcount) {
717 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
718 if (!callresults) {
719 PyErr_NoMemory();
720 return NULL;
721 }
722 callresult = callresults;
723 }
724 /* step 3: figure out how large a buffer we need */
725 for (f = format; *f; f++) {
726 if (*f == '%') {
727 const char* p = f;
728 width = 0;
729 while (isdigit((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 while (*++f && *f != '%' && !isalpha((unsigned)*f))
732 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
735 * they don't affect the amount of space we reserve.
736 */
737 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000738 (f[1] == 'd' || f[1] == 'u'))
739 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000740
Benjamin Peterson857ce152009-01-31 16:29:18 +0000741 switch (*f) {
742 case 'c':
743 (void)va_arg(count, int);
744 /* fall through... */
745 case '%':
746 n++;
747 break;
748 case 'd': case 'u': case 'i': case 'x':
749 (void) va_arg(count, int);
750 /* 20 bytes is enough to hold a 64-bit
751 integer. Decimal takes the most space.
752 This isn't enough for octal.
753 If a width is specified we need more
754 (which we allocate later). */
755 if (width < 20)
756 width = 20;
757 n += width;
758 if (abuffersize < width)
759 abuffersize = width;
760 break;
761 case 's':
762 {
763 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000764 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000765 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
766 if (!str)
767 goto fail;
768 n += PyUnicode_GET_SIZE(str);
769 /* Remember the str and switch to the next slot */
770 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000771 break;
772 }
773 case 'U':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 assert(obj && PyUnicode_Check(obj));
777 n += PyUnicode_GET_SIZE(obj);
778 break;
779 }
780 case 'V':
781 {
782 PyObject *obj = va_arg(count, PyObject *);
783 const char *str = va_arg(count, const char *);
784 assert(obj || str);
785 assert(!obj || PyUnicode_Check(obj));
786 if (obj)
787 n += PyUnicode_GET_SIZE(obj);
788 else
789 n += strlen(str);
790 break;
791 }
792 case 'S':
793 {
794 PyObject *obj = va_arg(count, PyObject *);
795 PyObject *str;
796 assert(obj);
797 str = PyObject_Str(obj);
798 if (!str)
799 goto fail;
800 n += PyUnicode_GET_SIZE(str);
801 /* Remember the str and switch to the next slot */
802 *callresult++ = str;
803 break;
804 }
805 case 'R':
806 {
807 PyObject *obj = va_arg(count, PyObject *);
808 PyObject *repr;
809 assert(obj);
810 repr = PyObject_Repr(obj);
811 if (!repr)
812 goto fail;
813 n += PyUnicode_GET_SIZE(repr);
814 /* Remember the repr and switch to the next slot */
815 *callresult++ = repr;
816 break;
817 }
818 case 'p':
819 (void) va_arg(count, int);
820 /* maximum 64-bit pointer representation:
821 * 0xffffffffffffffff
822 * so 19 characters is enough.
823 * XXX I count 18 -- what's the extra for?
824 */
825 n += 19;
826 break;
827 default:
828 /* if we stumble upon an unknown
829 formatting code, copy the rest of
830 the format string to the output
831 string. (we cannot just skip the
832 code, since there's no way to know
833 what's in the argument list) */
834 n += strlen(p);
835 goto expand;
836 }
837 } else
838 n++;
839 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000840 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000841 if (abuffersize > 20) {
842 abuffer = PyObject_Malloc(abuffersize);
843 if (!abuffer) {
844 PyErr_NoMemory();
845 goto fail;
846 }
847 realbuffer = abuffer;
848 }
849 else
850 realbuffer = buffer;
851 /* step 4: fill the buffer */
852 /* Since we've analyzed how much space we need for the worst case,
853 we don't have to resize the string.
854 There can be no errors beyond this point. */
855 string = PyUnicode_FromUnicode(NULL, n);
856 if (!string)
857 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000858
Benjamin Peterson857ce152009-01-31 16:29:18 +0000859 s = PyUnicode_AS_UNICODE(string);
860 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000861
Benjamin Peterson857ce152009-01-31 16:29:18 +0000862 for (f = format; *f; f++) {
863 if (*f == '%') {
864 const char* p = f++;
865 int longflag = 0;
866 int size_tflag = 0;
867 zeropad = (*f == '0');
868 /* parse the width.precision part */
869 width = 0;
870 while (isdigit((unsigned)*f))
871 width = (width*10) + *f++ - '0';
872 precision = 0;
873 if (*f == '.') {
874 f++;
875 while (isdigit((unsigned)*f))
876 precision = (precision*10) + *f++ - '0';
877 }
878 /* handle the long flag, but only for %ld and %lu.
879 others can be added when necessary. */
880 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
881 longflag = 1;
882 ++f;
883 }
884 /* handle the size_t flag. */
885 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
886 size_tflag = 1;
887 ++f;
888 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000889
Benjamin Peterson857ce152009-01-31 16:29:18 +0000890 switch (*f) {
891 case 'c':
892 *s++ = va_arg(vargs, int);
893 break;
894 case 'd':
895 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
896 if (longflag)
897 sprintf(realbuffer, fmt, va_arg(vargs, long));
898 else if (size_tflag)
899 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
900 else
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 'u':
905 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
906 if (longflag)
907 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
908 else if (size_tflag)
909 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
910 else
911 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
912 appendstring(realbuffer);
913 break;
914 case 'i':
915 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
916 sprintf(realbuffer, fmt, va_arg(vargs, int));
917 appendstring(realbuffer);
918 break;
919 case 'x':
920 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
921 sprintf(realbuffer, fmt, va_arg(vargs, int));
922 appendstring(realbuffer);
923 break;
924 case 's':
925 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000926 /* unused, since we already have the result */
927 (void) va_arg(vargs, char *);
928 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
929 PyUnicode_GET_SIZE(*callresult));
930 s += PyUnicode_GET_SIZE(*callresult);
931 /* We're done with the unicode()/repr() => forget it */
932 Py_DECREF(*callresult);
933 /* switch to next unicode()/repr() result */
934 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000935 break;
936 }
937 case 'U':
938 {
939 PyObject *obj = va_arg(vargs, PyObject *);
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 break;
944 }
945 case 'V':
946 {
947 PyObject *obj = va_arg(vargs, PyObject *);
948 const char *str = va_arg(vargs, const char *);
949 if (obj) {
950 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
951 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
952 s += size;
953 } else {
954 appendstring(str);
955 }
956 break;
957 }
958 case 'S':
959 case 'R':
960 {
961 Py_UNICODE *ucopy;
962 Py_ssize_t usize;
963 Py_ssize_t upos;
964 /* unused, since we already have the result */
965 (void) va_arg(vargs, PyObject *);
966 ucopy = PyUnicode_AS_UNICODE(*callresult);
967 usize = PyUnicode_GET_SIZE(*callresult);
968 for (upos = 0; upos<usize;)
969 *s++ = ucopy[upos++];
970 /* We're done with the unicode()/repr() => forget it */
971 Py_DECREF(*callresult);
972 /* switch to next unicode()/repr() result */
973 ++callresult;
974 break;
975 }
976 case 'p':
977 sprintf(buffer, "%p", va_arg(vargs, void*));
978 /* %p is ill-defined: ensure leading 0x. */
979 if (buffer[1] == 'X')
980 buffer[1] = 'x';
981 else if (buffer[1] != 'x') {
982 memmove(buffer+2, buffer, strlen(buffer)+1);
983 buffer[0] = '0';
984 buffer[1] = 'x';
985 }
986 appendstring(buffer);
987 break;
988 case '%':
989 *s++ = '%';
990 break;
991 default:
992 appendstring(p);
993 goto end;
994 }
995 } else
996 *s++ = *f;
997 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000998
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults)
1001 PyObject_Free(callresults);
1002 if (abuffer)
1003 PyObject_Free(abuffer);
1004 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1005 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001006 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001007 if (callresults) {
1008 PyObject **callresult2 = callresults;
1009 while (callresult2 < callresult) {
1010 Py_DECREF(*callresult2);
1011 ++callresult2;
1012 }
1013 PyObject_Free(callresults);
1014 }
1015 if (abuffer)
1016 PyObject_Free(abuffer);
1017 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018}
1019
1020#undef appendstring
1021
1022PyObject *
1023PyUnicode_FromFormat(const char *format, ...)
1024{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001025 PyObject* ret;
1026 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027
1028#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001029 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001030#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001031 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001032#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001033 ret = PyUnicode_FromFormatV(format, vargs);
1034 va_end(vargs);
1035 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001036}
1037
Martin v. Löwis18e16552006-02-15 17:27:45 +00001038Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001039 wchar_t *w,
1040 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041{
1042 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001043 PyErr_BadInternalCall();
1044 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046
1047 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001049 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051#ifdef HAVE_USABLE_WCHAR_T
1052 memcpy(w, unicode->str, size * sizeof(wchar_t));
1053#else
1054 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001055 register Py_UNICODE *u;
1056 register Py_ssize_t i;
1057 u = PyUnicode_AS_UNICODE(unicode);
1058 for (i = size; i > 0; i--)
1059 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061#endif
1062
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001063 if (size > PyUnicode_GET_SIZE(unicode))
1064 return PyUnicode_GET_SIZE(unicode);
1065 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001066 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067}
1068
1069#endif
1070
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001071PyObject *PyUnicode_FromOrdinal(int ordinal)
1072{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001073 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074
1075#ifdef Py_UNICODE_WIDE
1076 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x110000) "
1079 "(wide Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#else
1083 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001084 PyErr_SetString(PyExc_ValueError,
1085 "unichr() arg not in range(0x10000) "
1086 "(narrow Python build)");
1087 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001088 }
1089#endif
1090
Hye-Shik Chang40574832004-04-06 07:24:51 +00001091 s[0] = (Py_UNICODE)ordinal;
1092 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001093}
1094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095PyObject *PyUnicode_FromObject(register PyObject *obj)
1096{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001097 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001098 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001100 Py_INCREF(obj);
1101 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001102 }
1103 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 /* For a Unicode subtype that's not a Unicode object,
1105 return a true Unicode object with the same data. */
1106 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1107 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001108 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1110}
1111
1112PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001113 const char *encoding,
1114 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001116 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001119
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 PyErr_BadInternalCall();
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125#if 0
1126 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001127 that no encodings is given and then redirect to
1128 PyObject_Unicode() which then applies the additional logic for
1129 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001130
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001131 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001133
1134 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001135 if (PyUnicode_Check(obj)) {
1136 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001137 PyErr_SetString(PyExc_TypeError,
1138 "decoding Unicode is not supported");
1139 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001141 return PyObject_Unicode(obj);
1142 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143#else
1144 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001145 PyErr_SetString(PyExc_TypeError,
1146 "decoding Unicode is not supported");
1147 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001148 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001149#endif
1150
1151 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001152 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001153 s = PyString_AS_STRING(obj);
1154 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001155 }
Christian Heimes3497f942008-05-26 12:29:14 +00001156 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001157 /* Python 2.x specific */
1158 PyErr_Format(PyExc_TypeError,
1159 "decoding bytearray is not supported");
1160 return NULL;
1161 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001162 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 /* Overwrite the error message with something more useful in
1164 case of a TypeError. */
1165 if (PyErr_ExceptionMatches(PyExc_TypeError))
1166 PyErr_Format(PyExc_TypeError,
1167 "coercing to Unicode: need string or buffer, "
1168 "%.80s found",
1169 Py_TYPE(obj)->tp_name);
1170 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001173 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001174 if (len == 0)
1175 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001176
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001177 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001178 return v;
1179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182}
1183
1184PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 Py_ssize_t size,
1186 const char *encoding,
1187 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188{
1189 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001190
1191 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001192 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001193
1194 /* Shortcuts for common default encodings */
1195 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "latin-1") == 0)
1198 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001199#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1200 else if (strcmp(encoding, "mbcs") == 0)
1201 return PyUnicode_DecodeMBCS(s, size, errors);
1202#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001203 else if (strcmp(encoding, "ascii") == 0)
1204 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
1207 buffer = PyBuffer_FromMemory((void *)s, size);
1208 if (buffer == NULL)
1209 goto onError;
1210 unicode = PyCodec_Decode(buffer, encoding, errors);
1211 if (unicode == NULL)
1212 goto onError;
1213 if (!PyUnicode_Check(unicode)) {
1214 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001215 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001216 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 Py_DECREF(unicode);
1218 goto onError;
1219 }
1220 Py_DECREF(buffer);
1221 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001222
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 Py_XDECREF(buffer);
1225 return NULL;
1226}
1227
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001228PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
1233
1234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
1238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001241
1242 /* Decode via the codec registry */
1243 v = PyCodec_Decode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
1246 return v;
1247
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001248 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001249 return NULL;
1250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001253 Py_ssize_t size,
1254 const char *encoding,
1255 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256{
1257 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 unicode = PyUnicode_FromUnicode(s, size);
1260 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1263 Py_DECREF(unicode);
1264 return v;
1265}
1266
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001267PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1268 const char *encoding,
1269 const char *errors)
1270{
1271 PyObject *v;
1272
1273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_BadArgument();
1275 goto onError;
1276 }
1277
1278 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001279 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001280
1281 /* Encode via the codec registry */
1282 v = PyCodec_Encode(unicode, encoding, errors);
1283 if (v == NULL)
1284 goto onError;
1285 return v;
1286
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001287 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001288 return NULL;
1289}
1290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1292 const char *encoding,
1293 const char *errors)
1294{
1295 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001296
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 goto onError;
1300 }
Fred Drakee4315f52000-05-09 19:53:39 +00001301
Tim Petersced69f82003-09-16 20:30:58 +00001302 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001303 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001304
1305 /* Shortcuts for common default encodings */
1306 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001307 if (strcmp(encoding, "utf-8") == 0)
1308 return PyUnicode_AsUTF8String(unicode);
1309 else if (strcmp(encoding, "latin-1") == 0)
1310 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001311#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001312 else if (strcmp(encoding, "mbcs") == 0)
1313 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001314#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 else if (strcmp(encoding, "ascii") == 0)
1316 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318
1319 /* Encode via the codec registry */
1320 v = PyCodec_Encode(unicode, encoding, errors);
1321 if (v == NULL)
1322 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001323 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001325 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001326 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 Py_DECREF(v);
1328 goto onError;
1329 }
1330 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001331
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 return NULL;
1334}
1335
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001336PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001337 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001338{
1339 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1340
1341 if (v)
1342 return v;
1343 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1344 if (v && errors == NULL)
1345 ((PyUnicodeObject *)unicode)->defenc = v;
1346 return v;
1347}
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1350{
1351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 goto onError;
1354 }
1355 return PyUnicode_AS_UNICODE(unicode);
1356
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001357 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 return NULL;
1359}
1360
Martin v. Löwis18e16552006-02-15 17:27:45 +00001361Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362{
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367 return PyUnicode_GET_SIZE(unicode);
1368
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 return -1;
1371}
1372
Thomas Wouters78890102000-07-22 19:25:51 +00001373const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001374{
1375 return unicode_default_encoding;
1376}
1377
1378int PyUnicode_SetDefaultEncoding(const char *encoding)
1379{
1380 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Fred Drakee4315f52000-05-09 19:53:39 +00001382 /* Make sure the encoding is valid. As side effect, this also
1383 loads the encoding into the codec registry cache. */
1384 v = _PyCodec_Lookup(encoding);
1385 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001387 Py_DECREF(v);
1388 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001390 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001391 return 0;
1392
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001393 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001394 return -1;
1395}
1396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397/* error handling callback helper:
1398 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001399 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001400 and adjust various state variables.
1401 return 0 on success, -1 on error
1402*/
1403
1404static
1405int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 const char *encoding, const char *reason,
1407 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1408 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1409 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412
1413 PyObject *restuple = NULL;
1414 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1416 Py_ssize_t requiredsize;
1417 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 int res = -1;
1421
1422 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001423 *errorHandler = PyCodec_LookupError(errors);
1424 if (*errorHandler == NULL)
1425 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 }
1427
1428 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001429 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001430 encoding, input, insize, *startinpos, *endinpos, reason);
1431 if (*exceptionObject == NULL)
1432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 }
1434 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001435 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1436 goto onError;
1437 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1438 goto onError;
1439 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1440 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441 }
1442
1443 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1444 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001447 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 }
1450 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001453 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001454 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1456 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458
1459 /* need more space? (at least enough for what we
1460 have+the replacement+the rest of the string (starting
1461 at the new input position), so we won't have to check space
1462 when there are no errors in the rest of the string) */
1463 repptr = PyUnicode_AS_UNICODE(repunicode);
1464 repsize = PyUnicode_GET_SIZE(repunicode);
1465 requiredsize = *outpos + repsize + insize-newpos;
1466 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001467 if (requiredsize<2*outsize)
1468 requiredsize = 2*outsize;
1469 if (_PyUnicode_Resize(output, requiredsize) < 0)
1470 goto onError;
1471 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 }
1473 *endinpos = newpos;
1474 *inptr = input + newpos;
1475 Py_UNICODE_COPY(*outptr, repptr, repsize);
1476 *outptr += repsize;
1477 *outpos += repsize;
1478 /* we made it! */
1479 res = 0;
1480
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001481 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 Py_XDECREF(restuple);
1483 return res;
1484}
1485
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486/* --- UTF-7 Codec -------------------------------------------------------- */
1487
Antoine Pitrou653dece2009-05-04 18:32:32 +00001488/* See RFC2152 for details. We encode conservatively and decode liberally. */
1489
1490/* Three simple macros defining base-64. */
1491
/* Is c a base-64 character?
 *
 * The test uses explicit ASCII ranges instead of isalnum(): isalnum()
 * depends on the current locale and is only defined for values that fit
 * in an unsigned char (or EOF), while c may be any character value.
 * Note that c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1496
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' give 0-25,
 * 'a'-'z' give 26-51, '0'-'9' give 52-61, '+' gives 62 and anything else
 * (i.e. '/' for valid input) gives 63. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) != '+') ? 63 : 62)
1504
/* Return the base-64 character for the bottom 6 bits of n (higher bits
 * are masked off). */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789"                          \
      "+/")[(n) & 0x3f])
1509
/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies,
 * except '+', which introduces a base64 sequence. */

#define DECODE_DIRECT(c)                \
    (!((c) > 127 || (c) == '+'))
1517
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC 2152).  Each entry is one of:
 *
 *   0 : "Set D"      - alphanumerics and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special      - everything else, i.e. + \ ~ and the non-printing
 *                      codes 0-8, 11-12, 14-31 and 127; these must be
 *                      base-64 encoded
 *
 * The table is indexed by character code; one row per 16 codes.
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel  bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can  em sub esc  fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20:  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0,   0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30:   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40:   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50:   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60:   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70:   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~ del */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 3, 3,
};
1551
/* ENCODE_DIRECT: true when the encoder may emit character c as itself.
 * Characters 1..127 of category 0 (Set D) always qualify; category 1
 * (Set O) qualifies only when directO is true and category 2 (whitespace)
 * only when directWS is true.  RFC 2152 leaves both choices to the
 * application, hence the two flags.  NUL and anything >= 128 never
 * qualify. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) > 0 && (c) < 128 && \
     ((utf7_category[(c)] == 0) || \
      ((utf7_category[(c)] == 1) && (directO)) || \
      ((utf7_category[(c)] == 2) && (directWS))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001565 Py_ssize_t size,
1566 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001568 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1569}
1570
Antoine Pitrou653dece2009-05-04 18:32:32 +00001571/* The decoder. The only state we preserve is our read position,
1572 * i.e. how many characters we have consumed. So if we end in the
1573 * middle of a shift sequence we have to back off the read position
1574 * and the output to the beginning of the sequence, otherwise we lose
1575 * all the shift state (seen bits, number of bits seen, high
1576 * surrogate). */
1577
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001579 Py_ssize_t size,
1580 const char *errors,
1581 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t startinpos;
1585 Py_ssize_t endinpos;
1586 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 const char *e;
1588 PyUnicodeObject *unicode;
1589 Py_UNICODE *p;
1590 const char *errmsg = "";
1591 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001592 Py_UNICODE *shiftOutStart;
1593 unsigned int base64bits = 0;
1594 unsigned long base64buffer = 0;
1595 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 PyObject *errorHandler = NULL;
1597 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598
1599 unicode = _PyUnicode_New(size);
1600 if (!unicode)
1601 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 if (size == 0) {
1603 if (consumed)
1604 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
1608 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 e = s + size;
1611
1612 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001613 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614
Antoine Pitrou653dece2009-05-04 18:32:32 +00001615 if (inShift) { /* in a base-64 section */
1616 if (IS_BASE64(ch)) { /* consume a base-64 character */
1617 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1618 base64bits += 6;
1619 s++;
1620 if (base64bits >= 16) {
1621 /* we have enough bits for a UTF-16 value */
1622 Py_UNICODE outCh = (Py_UNICODE)
1623 (base64buffer >> (base64bits-16));
1624 base64bits -= 16;
1625 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1626 if (surrogate) {
1627 /* expecting a second surrogate */
1628 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1629#ifdef Py_UNICODE_WIDE
1630 *p++ = (((surrogate & 0x3FF)<<10)
1631 | (outCh & 0x3FF)) + 0x10000;
1632#else
1633 *p++ = surrogate;
1634 *p++ = outCh;
1635#endif
1636 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001637 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001638 }
1639 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001640 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001641 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 }
1643 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001644 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001645 /* first surrogate */
1646 surrogate = outCh;
1647 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001657 *p++ = surrogate;
1658 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Length in bytes of the UTF-8 sequence introduced by each possible lead
   byte; 0 marks a byte that cannot start a sequence.  See RFC 3629 for
   details.  One row per 16 byte values. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: illegal as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 illegal, C2-DF: two bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4: four bytes, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002210 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002211 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002212#else
2213 const int pairs = 0;
2214#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002215 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002216 int bo = 0; /* assume native ordering by default */
2217 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002218 /* Offsets from q for retrieving bytes in the right order. */
2219#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder[] = {0, 1, 2, 3};
2221#else
2222 int iorder[] = {3, 2, 1, 0};
2223#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002224 PyObject *errorHandler = NULL;
2225 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002226
Walter Dörwald6e390802007-08-17 16:41:28 +00002227 q = (unsigned char *)s;
2228 e = q + size;
2229
2230 if (byteorder)
2231 bo = *byteorder;
2232
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2237 if (bo == 0) {
2238 if (size >= 4) {
2239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002240 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = -1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = 1;
2249 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002250#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 if (bom == 0x0000FEFF) {
2252 q += 4;
2253 bo = 1;
2254 }
2255 else if (bom == 0xFFFE0000) {
2256 q += 4;
2257 bo = -1;
2258 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002261 }
2262
2263 if (bo == -1) {
2264 /* force LE */
2265 iorder[0] = 0;
2266 iorder[1] = 1;
2267 iorder[2] = 2;
2268 iorder[3] = 3;
2269 }
2270 else if (bo == 1) {
2271 /* force BE */
2272 iorder[0] = 3;
2273 iorder[1] = 2;
2274 iorder[2] = 1;
2275 iorder[3] = 0;
2276 }
2277
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002281 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283 pairs++;
2284#endif
2285
2286 /* This might be one to much, because of a BOM */
2287 unicode = _PyUnicode_New((size+3)/4+pairs);
2288 if (!unicode)
2289 return NULL;
2290 if (size == 0)
2291 return (PyObject *)unicode;
2292
2293 /* Unpack UTF-32 encoded data */
2294 p = unicode->str;
2295
Walter Dörwald6e390802007-08-17 16:41:28 +00002296 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2308 }
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (ch >= 0x110000)
2313 {
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2318 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 if (ch >= 0x10000)
2321 {
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324 }
2325 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002336 &unicode, &outpos, &p))
2337 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 }
2339
2340 if (byteorder)
2341 *byteorder = bo;
2342
2343 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002345
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2349
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002354 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
2361PyObject *
2362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002366{
2367 PyObject *v;
2368 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#else
2373 const int pairs = 0;
2374#endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378#else
2379 int iorder[] = {3, 2, 1, 0};
2380#endif
2381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382#define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002389 } while(0)
2390
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393#ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (v == NULL)
2405 return NULL;
2406
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002407 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (size == 0)
2411 return v;
2412
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2419 }
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2426 }
2427
2428 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002430#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2437 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002438 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#endif
2440 STORECHAR(ch);
2441 }
2442 return v;
2443#undef STORECHAR
2444}
2445
2446PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447{
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2451 }
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2455 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456}
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458/* --- UTF-16 Codec ------------------------------------------------------- */
2459
Tim Peters772747b2001-08-09 22:21:55 +00002460PyObject *
2461PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465{
Walter Dörwald69652032004-09-07 20:24:22 +00002466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467}
2468
2469PyObject *
2470PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002485 /* Offsets from q for retrieving byte pairs in the right order. */
2486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488#else
2489 int ihi = 0, ilo = 1;
2490#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2501
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002504 q = (unsigned char *)s;
2505 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002508 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Tim Peters772747b2001-08-09 22:21:55 +00002539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2543 }
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2548 }
2549
2550 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2562 }
2563 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564
Benjamin Peterson857ce152009-01-31 16:29:18 +00002565 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2570 }
2571
2572 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002573 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002574 q -= 2;
2575 if (consumed)
2576 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002578 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002579 endinpos = ((const char *)e)-starts;
2580 goto utf16Error;
2581 }
2582 if (0xD800 <= ch && ch <= 0xDBFF) {
2583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2584 q += 2;
2585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002586#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 *p++ = ch;
2588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002591#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 continue;
2593 }
2594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002595 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 startinpos = (((const char *)q)-4)-starts;
2597 endinpos = startinpos+2;
2598 goto utf16Error;
2599 }
2600
Benjamin Peterson857ce152009-01-31 16:29:18 +00002601 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002602 errmsg = "illegal encoding";
2603 startinpos = (((const char *)q)-2)-starts;
2604 endinpos = startinpos+2;
2605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002606
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002607 utf16Error:
2608 outpos = p-PyUnicode_AS_UNICODE(unicode);
2609 if (unicode_decode_call_errorhandler(
2610 errors, &errorHandler,
2611 "utf16", errmsg,
2612 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2613 &unicode, &outpos, &p))
2614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 }
2616
2617 if (byteorder)
2618 *byteorder = bo;
2619
Walter Dörwald69652032004-09-07 20:24:22 +00002620 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002621 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002624 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 goto onError;
2626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return (PyObject *)unicode;
2630
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 Py_XDECREF(errorHandler);
2634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 return NULL;
2636}
2637
Tim Peters772747b2001-08-09 22:21:55 +00002638PyObject *
2639PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 Py_ssize_t size,
2641 const char *errors,
2642 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643{
2644 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002645 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002646 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002647#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002648 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002649#else
2650 const int pairs = 0;
2651#endif
Tim Peters772747b2001-08-09 22:21:55 +00002652 /* Offsets from p for storing byte pairs in the right order. */
2653#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2654 int ihi = 1, ilo = 0;
2655#else
2656 int ihi = 0, ilo = 1;
2657#endif
2658
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659#define STORECHAR(CH) \
2660 do { \
2661 p[ihi] = ((CH) >> 8) & 0xff; \
2662 p[ilo] = (CH) & 0xff; \
2663 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002664 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002666#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002667 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 if (s[i] >= 0x10000)
2669 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002670#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002671 /* 2 * (size + pairs + (byteorder == 0)) */
2672 if (size > PY_SSIZE_T_MAX ||
2673 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002675 nsize = size + pairs + (byteorder == 0);
2676 bytesize = nsize * 2;
2677 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002678 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002679 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 if (v == NULL)
2681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002683 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002685 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002686 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002687 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002688
2689 if (byteorder == -1) {
2690 /* force LE */
2691 ihi = 1;
2692 ilo = 0;
2693 }
2694 else if (byteorder == 1) {
2695 /* force BE */
2696 ihi = 0;
2697 ilo = 1;
2698 }
2699
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002700 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 Py_UNICODE ch = *s++;
2702 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002703#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002704 if (ch >= 0x10000) {
2705 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2706 ch = 0xD800 | ((ch-0x10000) >> 10);
2707 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002708#endif
Tim Peters772747b2001-08-09 22:21:55 +00002709 STORECHAR(ch);
2710 if (ch2)
2711 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715}
2716
2717PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2718{
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 return NULL;
2722 }
2723 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002724 PyUnicode_GET_SIZE(unicode),
2725 NULL,
2726 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729/* --- Unicode Escape Codec ----------------------------------------------- */
2730
Fredrik Lundh06d12682001-01-24 07:59:11 +00002731static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002734 Py_ssize_t size,
2735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002738 Py_ssize_t startinpos;
2739 Py_ssize_t endinpos;
2740 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744 char* message;
2745 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 PyObject *errorHandler = NULL;
2747 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002748
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 /* Escaped strings will always be longer than the resulting
2750 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 length after conversion to the true value.
2752 (but if the error callback returns a long replacement string
2753 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 v = _PyUnicode_New(size);
2755 if (v == NULL)
2756 goto onError;
2757 if (size == 0)
2758 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 while (s < end) {
2764 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002765 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767
2768 /* Non-escape characters are interpreted as Unicode ordinals */
2769 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 continue;
2772 }
2773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 /* \ - Escapes */
2776 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002777 c = *s++;
2778 if (s > end)
2779 c = '\0'; /* Invalid after \ */
2780 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002782 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 case '\n': break;
2784 case '\\': *p++ = '\\'; break;
2785 case '\'': *p++ = '\''; break;
2786 case '\"': *p++ = '\"'; break;
2787 case 'b': *p++ = '\b'; break;
2788 case 'f': *p++ = '\014'; break; /* FF */
2789 case 't': *p++ = '\t'; break;
2790 case 'n': *p++ = '\n'; break;
2791 case 'r': *p++ = '\r'; break;
2792 case 'v': *p++ = '\013'; break; /* VT */
2793 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002795 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 case '0': case '1': case '2': case '3':
2797 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002799 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002801 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002804 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 break;
2806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002807 /* hex escapes */
2808 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810 digits = 2;
2811 message = "truncated \\xXX escape";
2812 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002814 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 digits = 4;
2817 message = "truncated \\uXXXX escape";
2818 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002820 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002821 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 digits = 8;
2823 message = "truncated \\UXXXXXXXX escape";
2824 hexescape:
2825 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002826 if (end - s < digits) {
2827 /* count only hex digits */
2828 for (; s < end; ++s) {
2829 c = (unsigned char)*s;
2830 if (!Py_ISXDIGIT(c))
2831 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002832 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002833 goto error;
2834 }
2835 for (; digits--; ++s) {
2836 c = (unsigned char)*s;
2837 if (!Py_ISXDIGIT(c))
2838 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002839 chr = (chr<<4) & ~0xF;
2840 if (c >= '0' && c <= '9')
2841 chr += c - '0';
2842 else if (c >= 'a' && c <= 'f')
2843 chr += 10 + c - 'a';
2844 else
2845 chr += 10 + c - 'A';
2846 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002847 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 /* _decoding_error will have already written into the
2849 target buffer. */
2850 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002851 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002852 /* when we get here, chr is a 32-bit unicode character */
2853 if (chr <= 0xffff)
2854 /* UCS-2 character */
2855 *p++ = (Py_UNICODE) chr;
2856 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002857 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002858 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002859#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002860 *p++ = chr;
2861#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002862 chr -= 0x10000L;
2863 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002864 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002865#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002866 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002867 message = "illegal Unicode character";
2868 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 break;
2871
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002872 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002873 case 'N':
2874 message = "malformed \\N character escape";
2875 if (ucnhash_CAPI == NULL) {
2876 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002877 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 if (ucnhash_CAPI == NULL)
2879 goto ucnhashError;
2880 }
2881 if (*s == '{') {
2882 const char *start = s+1;
2883 /* look for the closing brace */
2884 while (*s != '}' && s < end)
2885 s++;
2886 if (s > start && s < end && *s == '}') {
2887 /* found a name. look it up in the unicode database */
2888 message = "unknown Unicode character name";
2889 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002890 if (s - start - 1 <= INT_MAX &&
2891 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 goto store;
2893 }
2894 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002895 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002896
2897 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002898 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 message = "\\ at end of string";
2900 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002901 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002902 }
2903 else {
2904 *p++ = '\\';
2905 *p++ = (unsigned char)s[-1];
2906 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002907 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002909 continue;
2910
2911 error:
2912 endinpos = s-starts;
2913 outpos = p-PyUnicode_AS_UNICODE(v);
2914 if (unicode_decode_call_errorhandler(
2915 errors, &errorHandler,
2916 "unicodeescape", message,
2917 starts, size, &startinpos, &endinpos, &exc, &s,
2918 &v, &outpos, &p))
2919 goto onError;
2920 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002922 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002924 Py_XDECREF(errorHandler);
2925 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002927
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002928 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002929 PyErr_SetString(
2930 PyExc_UnicodeError,
2931 "\\N escapes not supported (can't load unicodedata module)"
2932 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002933 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934 Py_XDECREF(errorHandler);
2935 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002936 return NULL;
2937
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002938 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 Py_XDECREF(errorHandler);
2941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 return NULL;
2943}
2944
2945/* Return a Unicode-Escape string version of the Unicode object.
2946
2947 If quotes is true, the string is enclosed in u"" or u'' quotes as
2948 appropriate.
2949
2950*/
2951
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002952Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002953 Py_ssize_t size,
2954 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002955{
2956 /* like wcschr, but doesn't stop at NULL characters */
2957
2958 while (size-- > 0) {
2959 if (*s == ch)
2960 return s;
2961 s++;
2962 }
2963
2964 return NULL;
2965}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967static
2968PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002969 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 int quotes)
2971{
2972 PyObject *repr;
2973 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002975 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002976#ifdef Py_UNICODE_WIDE
2977 const Py_ssize_t expandsize = 10;
2978#else
2979 const Py_ssize_t expandsize = 6;
2980#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
Neal Norwitz17753ec2006-08-21 22:21:19 +00002982 /* XXX(nnorwitz): rather than over-allocating, it would be
2983 better to choose a different scheme. Perhaps scan the
2984 first N-chars of the string and allocate based on that size.
2985 */
2986 /* Initial allocation is based on the longest-possible unichr
2987 escape.
2988
2989 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2990 unichr, so in this case it's the longest unichr escape. In
2991 narrow (UTF-16) builds this is five chars per source unichr
2992 since there are two unichrs in the surrogate pair, so in narrow
2993 (UTF-16) builds it's not the longest unichr escape.
2994
2995 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2996 so in the narrow (UTF-16) build case it's the longest unichr
2997 escape.
2998 */
2999
Neal Norwitze7d8be82008-07-31 17:17:14 +00003000 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003001 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003002
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003003 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003004 2
3005 + expandsize*size
3006 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 if (repr == NULL)
3008 return NULL;
3009
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003010 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011
3012 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003014 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 !findchar(s, size, '"')) ? '"' : '\'';
3016 }
3017 while (size-- > 0) {
3018 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003019
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003020 /* Escape quotes and backslashes */
3021 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003022 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 *p++ = '\\';
3024 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003025 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003026 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003027
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003028#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003029 /* Map 21-bit characters to '\U00xxxxxx' */
3030 else if (ch >= 0x10000) {
3031 *p++ = '\\';
3032 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3034 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3035 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3036 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3037 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3038 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3039 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003040 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003041 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003042 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003043#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003044 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3045 else if (ch >= 0xD800 && ch < 0xDC00) {
3046 Py_UNICODE ch2;
3047 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003048
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003049 ch2 = *s++;
3050 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003051 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003052 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3053 *p++ = '\\';
3054 *p++ = 'U';
3055 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3056 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3057 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3058 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3059 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3060 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3061 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3062 *p++ = hexdigit[ucs & 0x0000000F];
3063 continue;
3064 }
3065 /* Fall through: isolated surrogates are copied as-is */
3066 s--;
3067 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003068 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003069#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003072 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 *p++ = '\\';
3074 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003075 *p++ = hexdigit[(ch >> 12) & 0x000F];
3076 *p++ = hexdigit[(ch >> 8) & 0x000F];
3077 *p++ = hexdigit[(ch >> 4) & 0x000F];
3078 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003080
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003081 /* Map special whitespace to '\t', \n', '\r' */
3082 else if (ch == '\t') {
3083 *p++ = '\\';
3084 *p++ = 't';
3085 }
3086 else if (ch == '\n') {
3087 *p++ = '\\';
3088 *p++ = 'n';
3089 }
3090 else if (ch == '\r') {
3091 *p++ = '\\';
3092 *p++ = 'r';
3093 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003094
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003095 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003096 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003099 *p++ = hexdigit[(ch >> 4) & 0x000F];
3100 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003101 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003102
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 /* Copy everything else as-is */
3104 else
3105 *p++ = (char) ch;
3106 }
3107 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003108 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109
3110 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003111 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 return repr;
3114}
3115
3116PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003117 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118{
3119 return unicodeescape_string(s, size, 0);
3120}
3121
3122PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3123{
3124 if (!PyUnicode_Check(unicode)) {
3125 PyErr_BadArgument();
3126 return NULL;
3127 }
3128 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003129 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130}
3131
3132/* --- Raw Unicode Escape Codec ------------------------------------------- */
3133
3134PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003135 Py_ssize_t size,
3136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003139 Py_ssize_t startinpos;
3140 Py_ssize_t endinpos;
3141 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 const char *end;
3145 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 PyObject *errorHandler = NULL;
3147 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003148
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 /* Escaped strings will always be longer than the resulting
3150 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 length after conversion to the true value. (But decoding error
3152 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 v = _PyUnicode_New(size);
3154 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003157 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 end = s + size;
3160 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003161 unsigned char c;
3162 Py_UCS4 x;
3163 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003164 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003166 /* Non-escape characters are interpreted as Unicode ordinals */
3167 if (*s != '\\') {
3168 *p++ = (unsigned char)*s++;
3169 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003170 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003171 startinpos = s-starts;
3172
3173 /* \u-escapes are only interpreted iff the number of leading
3174 backslashes if odd */
3175 bs = s;
3176 for (;s < end;) {
3177 if (*s != '\\')
3178 break;
3179 *p++ = (unsigned char)*s++;
3180 }
3181 if (((s - bs) & 1) == 0 ||
3182 s >= end ||
3183 (*s != 'u' && *s != 'U')) {
3184 continue;
3185 }
3186 p--;
3187 count = *s=='u' ? 4 : 8;
3188 s++;
3189
3190 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3191 outpos = p-PyUnicode_AS_UNICODE(v);
3192 for (x = 0, i = 0; i < count; ++i, ++s) {
3193 c = (unsigned char)*s;
3194 if (!isxdigit(c)) {
3195 endinpos = s-starts;
3196 if (unicode_decode_call_errorhandler(
3197 errors, &errorHandler,
3198 "rawunicodeescape", "truncated \\uXXXX",
3199 starts, size, &startinpos, &endinpos, &exc, &s,
3200 &v, &outpos, &p))
3201 goto onError;
3202 goto nextByte;
3203 }
3204 x = (x<<4) & ~0xF;
3205 if (c >= '0' && c <= '9')
3206 x += c - '0';
3207 else if (c >= 'a' && c <= 'f')
3208 x += 10 + c - 'a';
3209 else
3210 x += 10 + c - 'A';
3211 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003212 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003213 /* UCS-2 character */
3214 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003215 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003216 /* UCS-4 character. Either store directly, or as
3217 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003218#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003219 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003220#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003221 x -= 0x10000L;
3222 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3223 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003224#endif
3225 } else {
3226 endinpos = s-starts;
3227 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003228 if (unicode_decode_call_errorhandler(
3229 errors, &errorHandler,
3230 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003231 starts, size, &startinpos, &endinpos, &exc, &s,
3232 &v, &outpos, &p))
3233 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 nextByte:
3236 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003238 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 Py_XDECREF(errorHandler);
3247 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 return NULL;
3249}
3250
3251PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253{
3254 PyObject *repr;
3255 char *p;
3256 char *q;
3257
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003258 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003260 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003262 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003263#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003264
Neal Norwitze7d8be82008-07-31 17:17:14 +00003265 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003266 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003267
Neal Norwitze7d8be82008-07-31 17:17:14 +00003268 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 if (repr == NULL)
3270 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003271 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003272 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003274 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 while (size-- > 0) {
3276 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003278 /* Map 32-bit characters to '\Uxxxxxxxx' */
3279 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280 *p++ = '\\';
3281 *p++ = 'U';
3282 *p++ = hexdigit[(ch >> 28) & 0xf];
3283 *p++ = hexdigit[(ch >> 24) & 0xf];
3284 *p++ = hexdigit[(ch >> 20) & 0xf];
3285 *p++ = hexdigit[(ch >> 16) & 0xf];
3286 *p++ = hexdigit[(ch >> 12) & 0xf];
3287 *p++ = hexdigit[(ch >> 8) & 0xf];
3288 *p++ = hexdigit[(ch >> 4) & 0xf];
3289 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003290 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003291 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003292#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003293 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3294 if (ch >= 0xD800 && ch < 0xDC00) {
3295 Py_UNICODE ch2;
3296 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003298 ch2 = *s++;
3299 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003300 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3302 *p++ = '\\';
3303 *p++ = 'U';
3304 *p++ = hexdigit[(ucs >> 28) & 0xf];
3305 *p++ = hexdigit[(ucs >> 24) & 0xf];
3306 *p++ = hexdigit[(ucs >> 20) & 0xf];
3307 *p++ = hexdigit[(ucs >> 16) & 0xf];
3308 *p++ = hexdigit[(ucs >> 12) & 0xf];
3309 *p++ = hexdigit[(ucs >> 8) & 0xf];
3310 *p++ = hexdigit[(ucs >> 4) & 0xf];
3311 *p++ = hexdigit[ucs & 0xf];
3312 continue;
3313 }
3314 /* Fall through: isolated surrogates are copied as-is */
3315 s--;
3316 size++;
3317 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003318#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003319 /* Map 16-bit characters to '\uxxxx' */
3320 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 *p++ = '\\';
3322 *p++ = 'u';
3323 *p++ = hexdigit[(ch >> 12) & 0xf];
3324 *p++ = hexdigit[(ch >> 8) & 0xf];
3325 *p++ = hexdigit[(ch >> 4) & 0xf];
3326 *p++ = hexdigit[ch & 15];
3327 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003328 /* Copy everything else as-is */
3329 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 *p++ = (char) ch;
3331 }
3332 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003333 if (_PyString_Resize(&repr, p - q))
3334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 return repr;
3336}
3337
3338PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3339{
3340 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003341 PyErr_BadArgument();
3342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 }
3344 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346}
3347
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003348/* --- Unicode Internal Codec ------------------------------------------- */
3349
3350PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 Py_ssize_t size,
3352 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353{
3354 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003355 Py_ssize_t startinpos;
3356 Py_ssize_t endinpos;
3357 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003358 PyUnicodeObject *v;
3359 Py_UNICODE *p;
3360 const char *end;
3361 const char *reason;
3362 PyObject *errorHandler = NULL;
3363 PyObject *exc = NULL;
3364
Neal Norwitzd43069c2006-01-08 01:12:10 +00003365#ifdef Py_UNICODE_WIDE
3366 Py_UNICODE unimax = PyUnicode_GetMax();
3367#endif
3368
Armin Rigo7ccbca92006-10-04 12:17:45 +00003369 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3371 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003372 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003373 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 p = PyUnicode_AS_UNICODE(v);
3376 end = s + size;
3377
3378 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003379 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003380 /* We have to sanity check the raw data, otherwise doom looms for
3381 some malformed UCS-4 data. */
3382 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003383#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003384 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003385#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 end-s < Py_UNICODE_SIZE
3387 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003388 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 startinpos = s - starts;
3390 if (end-s < Py_UNICODE_SIZE) {
3391 endinpos = end-starts;
3392 reason = "truncated input";
3393 }
3394 else {
3395 endinpos = s - starts + Py_UNICODE_SIZE;
3396 reason = "illegal code point (> 0x10FFFF)";
3397 }
3398 outpos = p - PyUnicode_AS_UNICODE(v);
3399 if (unicode_decode_call_errorhandler(
3400 errors, &errorHandler,
3401 "unicode_internal", reason,
3402 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003403 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003404 goto onError;
3405 }
3406 }
3407 else {
3408 p++;
3409 s += Py_UNICODE_SIZE;
3410 }
3411 }
3412
Martin v. Löwis412fb672006-04-13 06:34:32 +00003413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003414 goto onError;
3415 Py_XDECREF(errorHandler);
3416 Py_XDECREF(exc);
3417 return (PyObject *)v;
3418
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003419 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 Py_XDECREF(v);
3421 Py_XDECREF(errorHandler);
3422 Py_XDECREF(exc);
3423 return NULL;
3424}
3425
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426/* --- Latin-1 Codec ------------------------------------------------------ */
3427
3428PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003429 Py_ssize_t size,
3430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431{
3432 PyUnicodeObject *v;
3433 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003434
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003436 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003437 Py_UNICODE r = *(unsigned char*)s;
3438 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003439 }
3440
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 v = _PyUnicode_New(size);
3442 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 p = PyUnicode_AS_UNICODE(v);
3447 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003450
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003451 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 Py_XDECREF(v);
3453 return NULL;
3454}
3455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456/* create or adjust a UnicodeEncodeError */
3457static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003458 const char *encoding,
3459 const Py_UNICODE *unicode, Py_ssize_t size,
3460 Py_ssize_t startpos, Py_ssize_t endpos,
3461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 *exceptionObject = PyUnicodeEncodeError_Create(
3465 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
3467 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3469 goto onError;
3470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3471 goto onError;
3472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3473 goto onError;
3474 return;
3475 onError:
3476 Py_DECREF(*exceptionObject);
3477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 }
3479}
3480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481/* raises a UnicodeEncodeError */
3482static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 const char *encoding,
3484 const Py_UNICODE *unicode, Py_ssize_t size,
3485 Py_ssize_t startpos, Py_ssize_t endpos,
3486 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487{
3488 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003489 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003491 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492}
3493
3494/* error handling callback helper:
3495 build arguments, call the callback and check the arguments,
3496 put the result into newpos and return the replacement string, which
3497 has to be freed by the caller */
3498static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 PyObject **errorHandler,
3500 const char *encoding, const char *reason,
3501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3502 Py_ssize_t startpos, Py_ssize_t endpos,
3503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506
3507 PyObject *restuple;
3508 PyObject *resunicode;
3509
3510 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003511 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 }
3515
3516 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003517 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520
3521 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003524 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003526 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 Py_DECREF(restuple);
3528 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 }
3530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003531 &resunicode, newpos)) {
3532 Py_DECREF(restuple);
3533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 }
3535 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003537 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3539 Py_DECREF(restuple);
3540 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 Py_INCREF(resunicode);
3543 Py_DECREF(restuple);
3544 return resunicode;
3545}
3546
3547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 Py_ssize_t size,
3549 const char *errors,
3550 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551{
3552 /* output object */
3553 PyObject *res;
3554 /* pointers to the beginning and end+1 of input */
3555 const Py_UNICODE *startp = p;
3556 const Py_UNICODE *endp = p + size;
3557 /* pointer to the beginning of the unencodable characters */
3558 /* const Py_UNICODE *badp = NULL; */
3559 /* pointer into the output */
3560 char *str;
3561 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t respos = 0;
3563 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003564 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 PyObject *errorHandler = NULL;
3567 PyObject *exc = NULL;
3568 /* the following variable is used for caching string comparisons
3569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3570 int known_errorHandler = -1;
3571
3572 /* allocate enough for a simple encoding without
3573 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003574 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 if (res == NULL)
3576 goto onError;
3577 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003578 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003579 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 ressize = size;
3581
3582 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003585 /* can we encode this? */
3586 if (c<limit) {
3587 /* no overflow check, because we know that the space is enough */
3588 *str++ = (char)c;
3589 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003590 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003591 else {
3592 Py_ssize_t unicodepos = p-startp;
3593 Py_ssize_t requiredsize;
3594 PyObject *repunicode;
3595 Py_ssize_t repsize;
3596 Py_ssize_t newpos;
3597 Py_ssize_t respos;
3598 Py_UNICODE *uni2;
3599 /* startpos for collecting unencodable chars */
3600 const Py_UNICODE *collstart = p;
3601 const Py_UNICODE *collend = p;
3602 /* find all unecodable characters */
3603 while ((collend < endp) && ((*collend)>=limit))
3604 ++collend;
3605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3606 if (known_errorHandler==-1) {
3607 if ((errors==NULL) || (!strcmp(errors, "strict")))
3608 known_errorHandler = 1;
3609 else if (!strcmp(errors, "replace"))
3610 known_errorHandler = 2;
3611 else if (!strcmp(errors, "ignore"))
3612 known_errorHandler = 3;
3613 else if (!strcmp(errors, "xmlcharrefreplace"))
3614 known_errorHandler = 4;
3615 else
3616 known_errorHandler = 0;
3617 }
3618 switch (known_errorHandler) {
3619 case 1: /* strict */
3620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3621 goto onError;
3622 case 2: /* replace */
3623 while (collstart++<collend)
3624 *str++ = '?'; /* fall through */
3625 case 3: /* ignore */
3626 p = collend;
3627 break;
3628 case 4: /* xmlcharrefreplace */
3629 respos = str-PyString_AS_STRING(res);
3630 /* determine replacement size (temporarily (mis)uses p) */
3631 for (p = collstart, repsize = 0; p < collend; ++p) {
3632 if (*p<10)
3633 repsize += 2+1+1;
3634 else if (*p<100)
3635 repsize += 2+2+1;
3636 else if (*p<1000)
3637 repsize += 2+3+1;
3638 else if (*p<10000)
3639 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003640#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003641 else
3642 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003643#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003644 else if (*p<100000)
3645 repsize += 2+5+1;
3646 else if (*p<1000000)
3647 repsize += 2+6+1;
3648 else
3649 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003650#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003651 }
3652 requiredsize = respos+repsize+(endp-collend);
3653 if (requiredsize > ressize) {
3654 if (requiredsize<2*ressize)
3655 requiredsize = 2*ressize;
3656 if (_PyString_Resize(&res, requiredsize))
3657 goto onError;
3658 str = PyString_AS_STRING(res) + respos;
3659 ressize = requiredsize;
3660 }
3661 /* generate replacement (temporarily (mis)uses p) */
3662 for (p = collstart; p < collend; ++p) {
3663 str += sprintf(str, "&#%d;", (int)*p);
3664 }
3665 p = collend;
3666 break;
3667 default:
3668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3669 encoding, reason, startp, size, &exc,
3670 collstart-startp, collend-startp, &newpos);
3671 if (repunicode == NULL)
3672 goto onError;
3673 /* need more space? (at least enough for what we have+the
3674 replacement+the rest of the string, so we won't have to
3675 check space for encodable characters) */
3676 respos = str-PyString_AS_STRING(res);
3677 repsize = PyUnicode_GET_SIZE(repunicode);
3678 requiredsize = respos+repsize+(endp-collend);
3679 if (requiredsize > ressize) {
3680 if (requiredsize<2*ressize)
3681 requiredsize = 2*ressize;
3682 if (_PyString_Resize(&res, requiredsize)) {
3683 Py_DECREF(repunicode);
3684 goto onError;
3685 }
3686 str = PyString_AS_STRING(res) + respos;
3687 ressize = requiredsize;
3688 }
3689 /* check if there is anything unencodable in the replacement
3690 and copy it to the output */
3691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3692 c = *uni2;
3693 if (c >= limit) {
3694 raise_encode_exception(&exc, encoding, startp, size,
3695 unicodepos, unicodepos+1, reason);
3696 Py_DECREF(repunicode);
3697 goto onError;
3698 }
3699 *str = (char)c;
3700 }
3701 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003702 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003704 }
3705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003707 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003709 /* If this fails res will be NULL */
3710 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 Py_XDECREF(errorHandler);
3712 Py_XDECREF(exc);
3713 return res;
3714
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003715 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 Py_XDECREF(res);
3717 Py_XDECREF(errorHandler);
3718 Py_XDECREF(exc);
3719 return NULL;
3720}
3721
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003723 Py_ssize_t size,
3724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727}
3728
3729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3730{
3731 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 PyErr_BadArgument();
3733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 }
3735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003736 PyUnicode_GET_SIZE(unicode),
3737 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738}
3739
3740/* --- 7-bit ASCII Codec -------------------------------------------------- */
3741
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003743 Py_ssize_t size,
3744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 PyUnicodeObject *v;
3748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t startinpos;
3750 Py_ssize_t endinpos;
3751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 const char *e;
3753 PyObject *errorHandler = NULL;
3754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003757 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003758 Py_UNICODE r = *(unsigned char*)s;
3759 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003760 }
Tim Petersced69f82003-09-16 20:30:58 +00003761
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 v = _PyUnicode_New(size);
3763 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003766 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 e = s + size;
3769 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003770 register unsigned char c = (unsigned char)*s;
3771 if (c < 128) {
3772 *p++ = c;
3773 ++s;
3774 }
3775 else {
3776 startinpos = s-starts;
3777 endinpos = startinpos + 1;
3778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3779 if (unicode_decode_call_errorhandler(
3780 errors, &errorHandler,
3781 "ascii", "ordinal not in range(128)",
3782 starts, size, &startinpos, &endinpos, &exc, &s,
3783 &v, &outpos, &p))
3784 goto onError;
3785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 Py_XDECREF(errorHandler);
3791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003793
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003794 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 Py_XDECREF(errorHandler);
3797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 return NULL;
3799}
3800
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003802 Py_ssize_t size,
3803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806}
3807
3808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3809{
3810 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 PyErr_BadArgument();
3812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
3814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003815 PyUnicode_GET_SIZE(unicode),
3816 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817}
3818
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003822
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003823#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003824#define NEED_RETRY
3825#endif
3826
3827/* XXX This code is limited to "true" double-byte encodings, as
3828 a) it assumes an incomplete character consists of a single byte, and
3829 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003831
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() does not move back from it, when the byte CharPrev() lands
   on is not itself a lead byte, or when CharPrev() stepped back exactly
   two bytes. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before == 2);
}
3842
3843/*
3844 * Decode MBCS string into unicode object. If 'final' is set, converts
3845 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3846 */
3847static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003848 const char *s, /* MBCS string */
3849 int size, /* sizeof MBCS string */
3850 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851{
3852 Py_UNICODE *p;
3853 Py_ssize_t n = 0;
3854 int usize = 0;
3855
3856 assert(size >= 0);
3857
3858 /* Skip trailing lead-byte unless 'final' is set */
3859 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003860 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003861
3862 /* First get the size of the result */
3863 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3865 if (usize == 0) {
3866 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3867 return -1;
3868 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003869 }
3870
3871 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003872 /* Create unicode object */
3873 *v = _PyUnicode_New(usize);
3874 if (*v == NULL)
3875 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003876 }
3877 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003878 /* Extend unicode object */
3879 n = PyUnicode_GET_SIZE(*v);
3880 if (_PyUnicode_Resize(v, n + usize) < 0)
3881 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003882 }
3883
3884 /* Do the conversion */
3885 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003886 p = PyUnicode_AS_UNICODE(*v) + n;
3887 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3888 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3889 return -1;
3890 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 }
3892
3893 return size;
3894}
3895
3896PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003897 Py_ssize_t size,
3898 const char *errors,
3899 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900{
3901 PyUnicodeObject *v = NULL;
3902 int done;
3903
3904 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003905 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003906
3907#ifdef NEED_RETRY
3908 retry:
3909 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003910 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003911 else
3912#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003913 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914
3915 if (done < 0) {
3916 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003917 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003918 }
3919
3920 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922
3923#ifdef NEED_RETRY
3924 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003925 s += done;
3926 size -= done;
3927 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003928 }
3929#endif
3930
3931 return (PyObject *)v;
3932}
3933
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003934PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003935 Py_ssize_t size,
3936 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003937{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003938 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3939}
3940
3941/*
3942 * Convert unicode into string object (MBCS).
3943 * Returns 0 if succeed, -1 otherwise.
3944 */
3945static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003946 const Py_UNICODE *p, /* unicode */
3947 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003948{
3949 int mbcssize = 0;
3950 Py_ssize_t n = 0;
3951
3952 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953
3954 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003955 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003956 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3957 if (mbcssize == 0) {
3958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3959 return -1;
3960 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003961 }
3962
Martin v. Löwisd8251432006-06-14 05:21:04 +00003963 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003964 /* Create string object */
3965 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3966 if (*repr == NULL)
3967 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003968 }
3969 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003970 /* Extend string object */
3971 n = PyString_Size(*repr);
3972 if (_PyString_Resize(repr, n + mbcssize) < 0)
3973 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974 }
3975
3976 /* Do the conversion */
3977 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003978 char *s = PyString_AS_STRING(*repr) + n;
3979 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3981 return -1;
3982 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003983 }
3984
3985 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003986}
3987
3988PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 Py_ssize_t size,
3990 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003991{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992 PyObject *repr = NULL;
3993 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003994
Martin v. Löwisd8251432006-06-14 05:21:04 +00003995#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003998 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 else
4000#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004002
Martin v. Löwisd8251432006-06-14 05:21:04 +00004003 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004004 Py_XDECREF(repr);
4005 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004006 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004007
4008#ifdef NEED_RETRY
4009 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004010 p += INT_MAX;
4011 size -= INT_MAX;
4012 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013 }
4014#endif
4015
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004016 return repr;
4017}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004018
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004019PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4020{
4021 if (!PyUnicode_Check(unicode)) {
4022 PyErr_BadArgument();
4023 return NULL;
4024 }
4025 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004026 PyUnicode_GET_SIZE(unicode),
4027 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004028}
4029
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030#undef NEED_RETRY
4031
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004032#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004033
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034/* --- Character Mapping Codec -------------------------------------------- */
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004037 Py_ssize_t size,
4038 PyObject *mapping,
4039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004042 Py_ssize_t startinpos;
4043 Py_ssize_t endinpos;
4044 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 PyUnicodeObject *v;
4047 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004048 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 PyObject *errorHandler = NULL;
4050 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004051 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004052 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004053
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 /* Default to Latin-1 */
4055 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004056 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057
4058 v = _PyUnicode_New(size);
4059 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004062 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004065 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004066 mapstring = PyUnicode_AS_UNICODE(mapping);
4067 maplen = PyUnicode_GET_SIZE(mapping);
4068 while (s < e) {
4069 unsigned char ch = *s;
4070 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 if (ch < maplen)
4073 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 if (x == 0xfffe) {
4076 /* undefined mapping */
4077 outpos = p-PyUnicode_AS_UNICODE(v);
4078 startinpos = s-starts;
4079 endinpos = startinpos+1;
4080 if (unicode_decode_call_errorhandler(
4081 errors, &errorHandler,
4082 "charmap", "character maps to <undefined>",
4083 starts, size, &startinpos, &endinpos, &exc, &s,
4084 &v, &outpos, &p)) {
4085 goto onError;
4086 }
4087 continue;
4088 }
4089 *p++ = x;
4090 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004091 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004092 }
4093 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004094 while (s < e) {
4095 unsigned char ch = *s;
4096 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4099 w = PyInt_FromLong((long)ch);
4100 if (w == NULL)
4101 goto onError;
4102 x = PyObject_GetItem(mapping, w);
4103 Py_DECREF(w);
4104 if (x == NULL) {
4105 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4106 /* No mapping found means: mapping is undefined. */
4107 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004108 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004109 } else
4110 goto onError;
4111 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004114 if (x == Py_None)
4115 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004116 if (PyInt_Check(x)) {
4117 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004118 if (value == 0xFFFE)
4119 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004120 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004121 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004122 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004123 Py_DECREF(x);
4124 goto onError;
4125 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004126
4127#ifndef Py_UNICODE_WIDE
4128 if (value > 0xFFFF) {
4129 /* see the code for 1-n mapping below */
4130 if (extrachars < 2) {
4131 /* resize first */
4132 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4133 Py_ssize_t needed = 10 - extrachars;
4134 extrachars += needed;
4135 /* XXX overflow detection missing */
4136 if (_PyUnicode_Resize(&v,
4137 PyUnicode_GET_SIZE(v) + needed) < 0) {
4138 Py_DECREF(x);
4139 goto onError;
4140 }
4141 p = PyUnicode_AS_UNICODE(v) + oldpos;
4142 }
4143 value -= 0x10000;
4144 *p++ = 0xD800 | (value >> 10);
4145 *p++ = 0xDC00 | (value & 0x3FF);
4146 extrachars -= 2;
4147 }
4148 else
4149#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004150 *p++ = (Py_UNICODE)value;
4151 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004152 else if (PyUnicode_Check(x)) {
4153 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004154
Serhiy Storchaka95997452013-01-15 14:42:59 +02004155 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004156 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004157 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4158 if (value == 0xFFFE)
4159 goto Undefined;
4160 *p++ = value;
4161 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004162 else if (targetsize > 1) {
4163 /* 1-n mapping */
4164 if (targetsize > extrachars) {
4165 /* resize first */
4166 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4167 Py_ssize_t needed = (targetsize - extrachars) + \
4168 (targetsize << 2);
4169 extrachars += needed;
4170 /* XXX overflow detection missing */
4171 if (_PyUnicode_Resize(&v,
4172 PyUnicode_GET_SIZE(v) + needed) < 0) {
4173 Py_DECREF(x);
4174 goto onError;
4175 }
4176 p = PyUnicode_AS_UNICODE(v) + oldpos;
4177 }
4178 Py_UNICODE_COPY(p,
4179 PyUnicode_AS_UNICODE(x),
4180 targetsize);
4181 p += targetsize;
4182 extrachars -= targetsize;
4183 }
4184 /* 1-0 mapping: skip the character */
4185 }
4186 else {
4187 /* wrong return value */
4188 PyErr_SetString(PyExc_TypeError,
4189 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004190 Py_DECREF(x);
4191 goto onError;
4192 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004193 Py_DECREF(x);
4194 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004195 continue;
4196Undefined:
4197 /* undefined mapping */
4198 Py_XDECREF(x);
4199 outpos = p-PyUnicode_AS_UNICODE(v);
4200 startinpos = s-starts;
4201 endinpos = startinpos+1;
4202 if (unicode_decode_call_errorhandler(
4203 errors, &errorHandler,
4204 "charmap", "character maps to <undefined>",
4205 starts, size, &startinpos, &endinpos, &exc, &s,
4206 &v, &outpos, &p)) {
4207 goto onError;
4208 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
4211 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004212 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4213 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 Py_XDECREF(errorHandler);
4215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004217
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004218 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 Py_XDECREF(errorHandler);
4220 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221 Py_XDECREF(v);
4222 return NULL;
4223}
4224
Martin v. Löwis3f767792006-06-04 19:36:28 +00004225/* Charmap encoding: the lookup table */
4226
4227struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004228 PyObject_HEAD
4229 unsigned char level1[32];
4230 int count2, count3;
4231 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004232};
4233
4234static PyObject*
4235encoding_map_size(PyObject *obj, PyObject* args)
4236{
4237 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004238 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004239 128*map->count3);
4240}
4241
4242static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004243 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004244 PyDoc_STR("Return the size (in bytes) of this object") },
4245 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004246};
4247
4248static void
4249encoding_map_dealloc(PyObject* o)
4250{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004251 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004252}
4253
4254static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004255 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004256 "EncodingMap", /*tp_name*/
4257 sizeof(struct encoding_map), /*tp_basicsize*/
4258 0, /*tp_itemsize*/
4259 /* methods */
4260 encoding_map_dealloc, /*tp_dealloc*/
4261 0, /*tp_print*/
4262 0, /*tp_getattr*/
4263 0, /*tp_setattr*/
4264 0, /*tp_compare*/
4265 0, /*tp_repr*/
4266 0, /*tp_as_number*/
4267 0, /*tp_as_sequence*/
4268 0, /*tp_as_mapping*/
4269 0, /*tp_hash*/
4270 0, /*tp_call*/
4271 0, /*tp_str*/
4272 0, /*tp_getattro*/
4273 0, /*tp_setattro*/
4274 0, /*tp_as_buffer*/
4275 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4276 0, /*tp_doc*/
4277 0, /*tp_traverse*/
4278 0, /*tp_clear*/
4279 0, /*tp_richcompare*/
4280 0, /*tp_weaklistoffset*/
4281 0, /*tp_iter*/
4282 0, /*tp_iternext*/
4283 encoding_map_methods, /*tp_methods*/
4284 0, /*tp_members*/
4285 0, /*tp_getset*/
4286 0, /*tp_base*/
4287 0, /*tp_dict*/
4288 0, /*tp_descr_get*/
4289 0, /*tp_descr_set*/
4290 0, /*tp_dictoffset*/
4291 0, /*tp_init*/
4292 0, /*tp_alloc*/
4293 0, /*tp_new*/
4294 0, /*tp_free*/
4295 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004296};
4297
4298PyObject*
4299PyUnicode_BuildEncodingMap(PyObject* string)
4300{
4301 Py_UNICODE *decode;
4302 PyObject *result;
4303 struct encoding_map *mresult;
4304 int i;
4305 int need_dict = 0;
4306 unsigned char level1[32];
4307 unsigned char level2[512];
4308 unsigned char *mlevel1, *mlevel2, *mlevel3;
4309 int count2 = 0, count3 = 0;
4310
4311 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4312 PyErr_BadArgument();
4313 return NULL;
4314 }
4315 decode = PyUnicode_AS_UNICODE(string);
4316 memset(level1, 0xFF, sizeof level1);
4317 memset(level2, 0xFF, sizeof level2);
4318
4319 /* If there isn't a one-to-one mapping of NULL to \0,
4320 or if there are non-BMP characters, we need to use
4321 a mapping dictionary. */
4322 if (decode[0] != 0)
4323 need_dict = 1;
4324 for (i = 1; i < 256; i++) {
4325 int l1, l2;
4326 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004327#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004328 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004329#endif
4330 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004331 need_dict = 1;
4332 break;
4333 }
4334 if (decode[i] == 0xFFFE)
4335 /* unmapped character */
4336 continue;
4337 l1 = decode[i] >> 11;
4338 l2 = decode[i] >> 7;
4339 if (level1[l1] == 0xFF)
4340 level1[l1] = count2++;
4341 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004342 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343 }
4344
4345 if (count2 >= 0xFF || count3 >= 0xFF)
4346 need_dict = 1;
4347
4348 if (need_dict) {
4349 PyObject *result = PyDict_New();
4350 PyObject *key, *value;
4351 if (!result)
4352 return NULL;
4353 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004354 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004355 key = PyInt_FromLong(decode[i]);
4356 value = PyInt_FromLong(i);
4357 if (!key || !value)
4358 goto failed1;
4359 if (PyDict_SetItem(result, key, value) == -1)
4360 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004361 Py_DECREF(key);
4362 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004363 }
4364 return result;
4365 failed1:
4366 Py_XDECREF(key);
4367 Py_XDECREF(value);
4368 Py_DECREF(result);
4369 return NULL;
4370 }
4371
4372 /* Create a three-level trie */
4373 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4374 16*count2 + 128*count3 - 1);
4375 if (!result)
4376 return PyErr_NoMemory();
4377 PyObject_Init(result, &EncodingMapType);
4378 mresult = (struct encoding_map*)result;
4379 mresult->count2 = count2;
4380 mresult->count3 = count3;
4381 mlevel1 = mresult->level1;
4382 mlevel2 = mresult->level23;
4383 mlevel3 = mresult->level23 + 16*count2;
4384 memcpy(mlevel1, level1, 32);
4385 memset(mlevel2, 0xFF, 16*count2);
4386 memset(mlevel3, 0, 128*count3);
4387 count3 = 0;
4388 for (i = 1; i < 256; i++) {
4389 int o1, o2, o3, i2, i3;
4390 if (decode[i] == 0xFFFE)
4391 /* unmapped character */
4392 continue;
4393 o1 = decode[i]>>11;
4394 o2 = (decode[i]>>7) & 0xF;
4395 i2 = 16*mlevel1[o1] + o2;
4396 if (mlevel2[i2] == 0xFF)
4397 mlevel2[i2] = count3++;
4398 o3 = decode[i] & 0x7F;
4399 i3 = 128*mlevel2[i2] + o3;
4400 mlevel3[i3] = i;
4401 }
4402 return result;
4403}
4404
4405static int
4406encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4407{
4408 struct encoding_map *map = (struct encoding_map*)mapping;
4409 int l1 = c>>11;
4410 int l2 = (c>>7) & 0xF;
4411 int l3 = c & 0x7F;
4412 int i;
4413
4414#ifdef Py_UNICODE_WIDE
4415 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004416 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004417 }
4418#endif
4419 if (c == 0)
4420 return 0;
4421 /* level 1*/
4422 i = map->level1[l1];
4423 if (i == 0xFF) {
4424 return -1;
4425 }
4426 /* level 2*/
4427 i = map->level23[16*i+l2];
4428 if (i == 0xFF) {
4429 return -1;
4430 }
4431 /* level 3 */
4432 i = map->level23[16*map->count2 + 128*i + l3];
4433 if (i == 0) {
4434 return -1;
4435 }
4436 return i;
4437}
4438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439/* Lookup the character ch in the mapping. If the character
4440 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004441 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 PyObject *w = PyInt_FromLong((long)c);
4445 PyObject *x;
4446
4447 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004448 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 x = PyObject_GetItem(mapping, w);
4450 Py_DECREF(w);
4451 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004452 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4453 /* No mapping found means: mapping is undefined. */
4454 PyErr_Clear();
4455 x = Py_None;
4456 Py_INCREF(x);
4457 return x;
4458 } else
4459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004461 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004464 long value = PyInt_AS_LONG(x);
4465 if (value < 0 || value > 255) {
4466 PyErr_SetString(PyExc_TypeError,
4467 "character mapping must be in range(256)");
4468 Py_DECREF(x);
4469 return NULL;
4470 }
4471 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004473 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004474 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004476 /* wrong return value */
4477 PyErr_SetString(PyExc_TypeError,
4478 "character mapping must return integer, None or str");
4479 Py_DECREF(x);
4480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 }
4482}
4483
Martin v. Löwis3f767792006-06-04 19:36:28 +00004484static int
4485charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4486{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004487 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4488 /* exponentially overallocate to minimize reallocations */
4489 if (requiredsize < 2*outsize)
4490 requiredsize = 2*outsize;
4491 if (_PyString_Resize(outobj, requiredsize)) {
4492 return 0;
4493 }
4494 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004495}
4496
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500/* lookup the character, put the result in the output string and adjust
4501 various state variables. Reallocate the output string if not enough
4502 space is available. Return a new reference to the object that
4503 was put in the output buffer, or Py_None, if the mapping was undefined
4504 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004505 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004507charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004508 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004510 PyObject *rep;
4511 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004512 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513
Christian Heimese93237d2007-12-19 02:37:44 +00004514 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004515 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004516 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004517 if (res == -1)
4518 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004519 if (outsize<requiredsize)
4520 if (!charmapencode_resize(outobj, outpos, requiredsize))
4521 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004522 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004523 outstart[(*outpos)++] = (char)res;
4524 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004525 }
4526
4527 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004530 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 Py_DECREF(rep);
4532 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004533 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 if (PyInt_Check(rep)) {
4535 Py_ssize_t requiredsize = *outpos+1;
4536 if (outsize<requiredsize)
4537 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4538 Py_DECREF(rep);
4539 return enc_EXCEPTION;
4540 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004541 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004542 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004543 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004544 else {
4545 const char *repchars = PyString_AS_STRING(rep);
4546 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4547 Py_ssize_t requiredsize = *outpos+repsize;
4548 if (outsize<requiredsize)
4549 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4550 Py_DECREF(rep);
4551 return enc_EXCEPTION;
4552 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004553 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004554 memcpy(outstart + *outpos, repchars, repsize);
4555 *outpos += repsize;
4556 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 }
Georg Brandl9f167602006-06-04 21:46:16 +00004558 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004559 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560}
4561
4562/* handle an error in PyUnicode_EncodeCharmap
4563 Return 0 on success, -1 on error */
4564static
4565int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004568 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570{
4571 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004572 Py_ssize_t repsize;
4573 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 Py_UNICODE *uni2;
4575 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004576 Py_ssize_t collstartpos = *inpos;
4577 Py_ssize_t collendpos = *inpos+1;
4578 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 char *encoding = "charmap";
4580 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004581 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 /* find all unencodable characters */
4584 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004585 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004586 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004587 int res = encoding_map_lookup(p[collendpos], mapping);
4588 if (res != -1)
4589 break;
4590 ++collendpos;
4591 continue;
4592 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004593
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004594 rep = charmapencode_lookup(p[collendpos], mapping);
4595 if (rep==NULL)
4596 return -1;
4597 else if (rep!=Py_None) {
4598 Py_DECREF(rep);
4599 break;
4600 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004601 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004602 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 }
4604 /* cache callback name lookup
4605 * (if not done yet, i.e. it's the first error) */
4606 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004607 if ((errors==NULL) || (!strcmp(errors, "strict")))
4608 *known_errorHandler = 1;
4609 else if (!strcmp(errors, "replace"))
4610 *known_errorHandler = 2;
4611 else if (!strcmp(errors, "ignore"))
4612 *known_errorHandler = 3;
4613 else if (!strcmp(errors, "xmlcharrefreplace"))
4614 *known_errorHandler = 4;
4615 else
4616 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 }
4618 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004619 case 1: /* strict */
4620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4621 return -1;
4622 case 2: /* replace */
4623 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004624 x = charmapencode_output('?', mapping, res, respos);
4625 if (x==enc_EXCEPTION) {
4626 return -1;
4627 }
4628 else if (x==enc_FAILED) {
4629 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4630 return -1;
4631 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004632 }
4633 /* fall through */
4634 case 3: /* ignore */
4635 *inpos = collendpos;
4636 break;
4637 case 4: /* xmlcharrefreplace */
4638 /* generate replacement (temporarily (mis)uses p) */
4639 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004640 char buffer[2+29+1+1];
4641 char *cp;
4642 sprintf(buffer, "&#%d;", (int)p[collpos]);
4643 for (cp = buffer; *cp; ++cp) {
4644 x = charmapencode_output(*cp, mapping, res, respos);
4645 if (x==enc_EXCEPTION)
4646 return -1;
4647 else if (x==enc_FAILED) {
4648 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4649 return -1;
4650 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004651 }
4652 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004653 *inpos = collendpos;
4654 break;
4655 default:
4656 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004657 encoding, reason, p, size, exceptionObject,
4658 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004659 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004660 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004661 /* generate replacement */
4662 repsize = PyUnicode_GET_SIZE(repunicode);
4663 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004664 x = charmapencode_output(*uni2, mapping, res, respos);
4665 if (x==enc_EXCEPTION) {
4666 return -1;
4667 }
4668 else if (x==enc_FAILED) {
4669 Py_DECREF(repunicode);
4670 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4671 return -1;
4672 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004673 }
4674 *inpos = newpos;
4675 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 }
4677 return 0;
4678}
4679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004681 Py_ssize_t size,
4682 PyObject *mapping,
4683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 /* output object */
4686 PyObject *res = NULL;
4687 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 PyObject *errorHandler = NULL;
4692 PyObject *exc = NULL;
4693 /* the following variable is used for caching string comparisons
4694 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4695 * 3=ignore, 4=xmlcharrefreplace */
4696 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
4698 /* Default to Latin-1 */
4699 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004700 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 /* allocate enough for a simple encoding without
4703 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004704 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 if (res == NULL)
4706 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004707 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004708 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 /* try to encode it */
4712 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4713 if (x==enc_EXCEPTION) /* error */
4714 goto onError;
4715 if (x==enc_FAILED) { /* unencodable character */
4716 if (charmap_encoding_error(p, size, &inpos, mapping,
4717 &exc,
4718 &known_errorHandler, &errorHandler, errors,
4719 &res, &respos)) {
4720 goto onError;
4721 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004722 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004723 else
4724 /* done with this character => adjust input position */
4725 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004729 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004730 if (_PyString_Resize(&res, respos))
4731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 }
4733 Py_XDECREF(exc);
4734 Py_XDECREF(errorHandler);
4735 return res;
4736
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004737 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 Py_XDECREF(res);
4739 Py_XDECREF(exc);
4740 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 return NULL;
4742}
4743
4744PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746{
4747 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 PyErr_BadArgument();
4749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 }
4751 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004752 PyUnicode_GET_SIZE(unicode),
4753 mapping,
4754 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755}
4756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757/* create or adjust a UnicodeTranslateError */
4758static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004759 const Py_UNICODE *unicode, Py_ssize_t size,
4760 Py_ssize_t startpos, Py_ssize_t endpos,
4761 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004764 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004765 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 }
4767 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004768 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4769 goto onError;
4770 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4771 goto onError;
4772 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4773 goto onError;
4774 return;
4775 onError:
4776 Py_DECREF(*exceptionObject);
4777 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 }
4779}
4780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781/* raises a UnicodeTranslateError */
4782static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 const Py_UNICODE *unicode, Py_ssize_t size,
4784 Py_ssize_t startpos, Py_ssize_t endpos,
4785 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786{
4787 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004790 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791}
4792
4793/* error handling callback helper:
4794 build arguments, call the callback and check the arguments,
4795 put the result into newpos and return the replacement string, which
4796 has to be freed by the caller */
4797static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004798 PyObject **errorHandler,
4799 const char *reason,
4800 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4801 Py_ssize_t startpos, Py_ssize_t endpos,
4802 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004804 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805
Martin v. Löwis412fb672006-04-13 06:34:32 +00004806 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 PyObject *restuple;
4808 PyObject *resunicode;
4809
4810 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 }
4815
4816 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820
4821 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004826 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 Py_DECREF(restuple);
4828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 }
4830 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 &resunicode, &i_newpos)) {
4832 Py_DECREF(restuple);
4833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004835 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 else
4838 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004839 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4841 Py_DECREF(restuple);
4842 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 Py_INCREF(resunicode);
4845 Py_DECREF(restuple);
4846 return resunicode;
4847}
4848
4849/* Lookup the character ch in the mapping and put the result in result,
4850 which must be decrefed by the caller.
4851 Return 0 on success, -1 on error */
4852static
4853int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4854{
4855 PyObject *w = PyInt_FromLong((long)c);
4856 PyObject *x;
4857
4858 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004859 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 x = PyObject_GetItem(mapping, w);
4861 Py_DECREF(w);
4862 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4864 /* No mapping found means: use 1:1 mapping. */
4865 PyErr_Clear();
4866 *result = NULL;
4867 return 0;
4868 } else
4869 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 }
4871 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 *result = x;
4873 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 }
4875 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004876 long value = PyInt_AS_LONG(x);
4877 long max = PyUnicode_GetMax();
4878 if (value < 0 || value > max) {
4879 PyErr_Format(PyExc_TypeError,
4880 "character mapping must be in range(0x%lx)", max+1);
4881 Py_DECREF(x);
4882 return -1;
4883 }
4884 *result = x;
4885 return 0;
4886 }
4887 else if (PyUnicode_Check(x)) {
4888 *result = x;
4889 return 0;
4890 }
4891 else {
4892 /* wrong return value */
4893 PyErr_SetString(PyExc_TypeError,
4894 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004895 Py_DECREF(x);
4896 return -1;
4897 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898}
4899/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004900 if not reallocate and adjust various state variables.
4901 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902static
Walter Dörwald4894c302003-10-24 14:25:28 +00004903int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004904 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004906 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004907 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004908 /* remember old output position */
4909 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4910 /* exponentially overallocate to minimize reallocations */
4911 if (requiredsize < 2 * oldsize)
4912 requiredsize = 2 * oldsize;
4913 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4914 return -1;
4915 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 }
4917 return 0;
4918}
4919/* lookup the character, put the result in the output string and adjust
4920 various state variables. Return a new reference to the object that
4921 was put in the output buffer in *result, or Py_None, if the mapping was
4922 undefined (in which case no character was written).
4923 The called must decref result.
4924 Return 0 on success, -1 on error. */
4925static
Walter Dörwald4894c302003-10-24 14:25:28 +00004926int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4928 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929{
Walter Dörwald4894c302003-10-24 14:25:28 +00004930 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004931 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004933 /* not found => default to 1:1 mapping */
4934 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 }
4936 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004937 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004939 /* no overflow check, because we know that the space is enough */
4940 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 }
4942 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004943 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4944 if (repsize==1) {
4945 /* no overflow check, because we know that the space is enough */
4946 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4947 }
4948 else if (repsize!=0) {
4949 /* more than one character */
4950 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4951 (insize - (curinp-startinp)) +
4952 repsize - 1;
4953 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4954 return -1;
4955 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4956 *outp += repsize;
4957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 }
4959 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 return 0;
4962}
4963
4964PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004965 Py_ssize_t size,
4966 PyObject *mapping,
4967 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 /* output object */
4970 PyObject *res = NULL;
4971 /* pointers to the beginning and end+1 of input */
4972 const Py_UNICODE *startp = p;
4973 const Py_UNICODE *endp = p + size;
4974 /* pointer into the output */
4975 Py_UNICODE *str;
4976 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004977 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 char *reason = "character maps to <undefined>";
4979 PyObject *errorHandler = NULL;
4980 PyObject *exc = NULL;
4981 /* the following variable is used for caching string comparisons
4982 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4983 * 3=ignore, 4=xmlcharrefreplace */
4984 int known_errorHandler = -1;
4985
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004987 PyErr_BadArgument();
4988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990
4991 /* allocate enough for a simple 1:1 translation without
4992 replacements, if we need more, we'll resize */
4993 res = PyUnicode_FromUnicode(NULL, size);
4994 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004995 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004997 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005001 /* try to encode it */
5002 PyObject *x = NULL;
5003 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5004 Py_XDECREF(x);
5005 goto onError;
5006 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005007 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005008 if (x!=Py_None) /* it worked => adjust input pointer */
5009 ++p;
5010 else { /* untranslatable character */
5011 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5012 Py_ssize_t repsize;
5013 Py_ssize_t newpos;
5014 Py_UNICODE *uni2;
5015 /* startpos for collecting untranslatable chars */
5016 const Py_UNICODE *collstart = p;
5017 const Py_UNICODE *collend = p+1;
5018 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005020 /* find all untranslatable characters */
5021 while (collend < endp) {
5022 if (charmaptranslate_lookup(*collend, mapping, &x))
5023 goto onError;
5024 Py_XDECREF(x);
5025 if (x!=Py_None)
5026 break;
5027 ++collend;
5028 }
5029 /* cache callback name lookup
5030 * (if not done yet, i.e. it's the first error) */
5031 if (known_errorHandler==-1) {
5032 if ((errors==NULL) || (!strcmp(errors, "strict")))
5033 known_errorHandler = 1;
5034 else if (!strcmp(errors, "replace"))
5035 known_errorHandler = 2;
5036 else if (!strcmp(errors, "ignore"))
5037 known_errorHandler = 3;
5038 else if (!strcmp(errors, "xmlcharrefreplace"))
5039 known_errorHandler = 4;
5040 else
5041 known_errorHandler = 0;
5042 }
5043 switch (known_errorHandler) {
5044 case 1: /* strict */
5045 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005046 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005047 case 2: /* replace */
5048 /* No need to check for space, this is a 1:1 replacement */
5049 for (coll = collstart; coll<collend; ++coll)
5050 *str++ = '?';
5051 /* fall through */
5052 case 3: /* ignore */
5053 p = collend;
5054 break;
5055 case 4: /* xmlcharrefreplace */
5056 /* generate replacement (temporarily (mis)uses p) */
5057 for (p = collstart; p < collend; ++p) {
5058 char buffer[2+29+1+1];
5059 char *cp;
5060 sprintf(buffer, "&#%d;", (int)*p);
5061 if (charmaptranslate_makespace(&res, &str,
5062 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5063 goto onError;
5064 for (cp = buffer; *cp; ++cp)
5065 *str++ = *cp;
5066 }
5067 p = collend;
5068 break;
5069 default:
5070 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5071 reason, startp, size, &exc,
5072 collstart-startp, collend-startp, &newpos);
5073 if (repunicode == NULL)
5074 goto onError;
5075 /* generate replacement */
5076 repsize = PyUnicode_GET_SIZE(repunicode);
5077 if (charmaptranslate_makespace(&res, &str,
5078 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5079 Py_DECREF(repunicode);
5080 goto onError;
5081 }
5082 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5083 *str++ = *uni2;
5084 p = startp + newpos;
5085 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005086 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005087 }
5088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 /* Resize if we allocated to much */
5090 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005091 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005092 if (PyUnicode_Resize(&res, respos) < 0)
5093 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 }
5095 Py_XDECREF(exc);
5096 Py_XDECREF(errorHandler);
5097 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005099 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 Py_XDECREF(res);
5101 Py_XDECREF(exc);
5102 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 return NULL;
5104}
5105
5106PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005107 PyObject *mapping,
5108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109{
5110 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005111
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 str = PyUnicode_FromObject(str);
5113 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005116 PyUnicode_GET_SIZE(str),
5117 mapping,
5118 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 Py_DECREF(str);
5120 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005121
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005122 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 Py_XDECREF(str);
5124 return NULL;
5125}
Tim Petersced69f82003-09-16 20:30:58 +00005126
Guido van Rossum9e896b32000-04-05 20:11:21 +00005127/* --- Decimal Encoder ---------------------------------------------------- */
5128
5129int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005130 Py_ssize_t length,
5131 char *output,
5132 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005133{
5134 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 PyObject *errorHandler = NULL;
5136 PyObject *exc = NULL;
5137 const char *encoding = "decimal";
5138 const char *reason = "invalid decimal Unicode string";
5139 /* the following variable is used for caching string comparisons
5140 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5141 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005142
5143 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005144 PyErr_BadArgument();
5145 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005146 }
5147
5148 p = s;
5149 end = s + length;
5150 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 register Py_UNICODE ch = *p;
5152 int decimal;
5153 PyObject *repunicode;
5154 Py_ssize_t repsize;
5155 Py_ssize_t newpos;
5156 Py_UNICODE *uni2;
5157 Py_UNICODE *collstart;
5158 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005159
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005161 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005162 ++p;
5163 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005164 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005165 decimal = Py_UNICODE_TODECIMAL(ch);
5166 if (decimal >= 0) {
5167 *output++ = '0' + decimal;
5168 ++p;
5169 continue;
5170 }
5171 if (0 < ch && ch < 256) {
5172 *output++ = (char)ch;
5173 ++p;
5174 continue;
5175 }
5176 /* All other characters are considered unencodable */
5177 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005178 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005179 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005180 Py_UNICODE_ISSPACE(*collend) ||
5181 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005182 break;
5183 }
5184 /* cache callback name lookup
5185 * (if not done yet, i.e. it's the first error) */
5186 if (known_errorHandler==-1) {
5187 if ((errors==NULL) || (!strcmp(errors, "strict")))
5188 known_errorHandler = 1;
5189 else if (!strcmp(errors, "replace"))
5190 known_errorHandler = 2;
5191 else if (!strcmp(errors, "ignore"))
5192 known_errorHandler = 3;
5193 else if (!strcmp(errors, "xmlcharrefreplace"))
5194 known_errorHandler = 4;
5195 else
5196 known_errorHandler = 0;
5197 }
5198 switch (known_errorHandler) {
5199 case 1: /* strict */
5200 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5201 goto onError;
5202 case 2: /* replace */
5203 for (p = collstart; p < collend; ++p)
5204 *output++ = '?';
5205 /* fall through */
5206 case 3: /* ignore */
5207 p = collend;
5208 break;
5209 case 4: /* xmlcharrefreplace */
5210 /* generate replacement (temporarily (mis)uses p) */
5211 for (p = collstart; p < collend; ++p)
5212 output += sprintf(output, "&#%d;", (int)*p);
5213 p = collend;
5214 break;
5215 default:
5216 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5217 encoding, reason, s, length, &exc,
5218 collstart-s, collend-s, &newpos);
5219 if (repunicode == NULL)
5220 goto onError;
5221 /* generate replacement */
5222 repsize = PyUnicode_GET_SIZE(repunicode);
5223 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5224 Py_UNICODE ch = *uni2;
5225 if (Py_UNICODE_ISSPACE(ch))
5226 *output++ = ' ';
5227 else {
5228 decimal = Py_UNICODE_TODECIMAL(ch);
5229 if (decimal >= 0)
5230 *output++ = '0' + decimal;
5231 else if (0 < ch && ch < 256)
5232 *output++ = (char)ch;
5233 else {
5234 Py_DECREF(repunicode);
5235 raise_encode_exception(&exc, encoding,
5236 s, length, collstart-s, collend-s, reason);
5237 goto onError;
5238 }
5239 }
5240 }
5241 p = s + newpos;
5242 Py_DECREF(repunicode);
5243 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005244 }
5245 /* 0-terminate the output string */
5246 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 Py_XDECREF(exc);
5248 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005249 return 0;
5250
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005251 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 Py_XDECREF(exc);
5253 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005254 return -1;
5255}
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257/* --- Helpers ------------------------------------------------------------ */
5258
Eric Smitha9f7d622008-02-17 19:46:49 +00005259#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005260#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005261
5262#include "stringlib/count.h"
5263#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005264#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005265#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005266
Fredrik Lundhc8162812006-05-26 19:33:03 +00005267/* helper macro to fixup start/end slice values
   in place for a sequence of length len:

     - end greater than len is clamped to len; a negative end has len
       added to it and is then floored at 0;
     - a negative start has len added to it and is then floored at 0.
       start is NOT clamped against len from above, so callers must cope
       with start > end themselves.

   start and end are assigned to and every argument is evaluated more
   than once, so pass plain variables without side effects.  The
   expansion is a bare sequence of if-statements (no do/while wrapper):
   use it only as a full statement of its own, never as the unbraced
   body of an if/else. */
Antoine Pitrou64672132010-01-13 07:55:48 +00005268#define ADJUST_INDICES(start, end, len) \
5269 if (end > len) \
5270 end = len; \
5271 else if (end < 0) { \
5272 end += len; \
5273 if (end < 0) \
5274 end = 0; \
5275 } \
5276 if (start < 0) { \
5277 start += len; \
5278 if (start < 0) \
5279 start = 0; \
5280 }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005281
Martin v. Löwis18e16552006-02-15 17:27:45 +00005282Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005283 PyObject *substr,
5284 Py_ssize_t start,
5285 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005287 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005288 PyUnicodeObject* str_obj;
5289 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005290
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005291 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5292 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005293 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005294 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5295 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005296 Py_DECREF(str_obj);
5297 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 }
Tim Petersced69f82003-09-16 20:30:58 +00005299
Antoine Pitrou64672132010-01-13 07:55:48 +00005300 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005301 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005302 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5303 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005304 );
5305
5306 Py_DECREF(sub_obj);
5307 Py_DECREF(str_obj);
5308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 return result;
5310}
5311
Martin v. Löwis18e16552006-02-15 17:27:45 +00005312Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005313 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005314 Py_ssize_t start,
5315 Py_ssize_t end,
5316 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005319
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005320 str = PyUnicode_FromObject(str);
5321 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005322 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005323 sub = PyUnicode_FromObject(sub);
5324 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005325 Py_DECREF(str);
5326 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 }
Tim Petersced69f82003-09-16 20:30:58 +00005328
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005329 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005330 result = stringlib_find_slice(
5331 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5332 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5333 start, end
5334 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005335 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005336 result = stringlib_rfind_slice(
5337 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5338 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5339 start, end
5340 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005341
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005342 Py_DECREF(str);
5343 Py_DECREF(sub);
5344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 return result;
5346}
5347
Tim Petersced69f82003-09-16 20:30:58 +00005348static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005350 PyUnicodeObject *substring,
5351 Py_ssize_t start,
5352 Py_ssize_t end,
5353 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 if (substring->length == 0)
5356 return 1;
5357
Antoine Pitrou64672132010-01-13 07:55:48 +00005358 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 end -= substring->length;
5360 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005361 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
5363 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 if (Py_UNICODE_MATCH(self, end, substring))
5365 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 } else {
5367 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005368 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 }
5370
5371 return 0;
5372}
5373
Martin v. Löwis18e16552006-02-15 17:27:45 +00005374Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005375 PyObject *substr,
5376 Py_ssize_t start,
5377 Py_ssize_t end,
5378 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 str = PyUnicode_FromObject(str);
5383 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 substr = PyUnicode_FromObject(substr);
5386 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005387 Py_DECREF(str);
5388 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 }
Tim Petersced69f82003-09-16 20:30:58 +00005390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005392 (PyUnicodeObject *)substr,
5393 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 Py_DECREF(str);
5395 Py_DECREF(substr);
5396 return result;
5397}
5398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399/* Apply fixfct filter to the Unicode object self and return a
5400 reference to the modified object */
5401
Tim Petersced69f82003-09-16 20:30:58 +00005402static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005404 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405{
5406
5407 PyUnicodeObject *u;
5408
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005409 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005411 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005412
5413 Py_UNICODE_COPY(u->str, self->str, self->length);
5414
Tim Peters7a29bd52001-09-12 03:03:31 +00005415 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005416 /* fixfct should return TRUE if it modified the buffer. If
5417 FALSE, return a reference to the original buffer instead
5418 (to save space, not time) */
5419 Py_INCREF(self);
5420 Py_DECREF(u);
5421 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
5423 return (PyObject*) u;
5424}
5425
Tim Petersced69f82003-09-16 20:30:58 +00005426static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427int fixupper(PyUnicodeObject *self)
5428{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005429 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_UNICODE *s = self->str;
5431 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005432
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005434 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005435
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005436 ch = Py_UNICODE_TOUPPER(*s);
5437 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005439 *s = ch;
5440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 s++;
5442 }
5443
5444 return status;
5445}
5446
Tim Petersced69f82003-09-16 20:30:58 +00005447static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448int fixlower(PyUnicodeObject *self)
5449{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005450 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 Py_UNICODE *s = self->str;
5452 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005453
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005455 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005456
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005457 ch = Py_UNICODE_TOLOWER(*s);
5458 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005460 *s = ch;
5461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 s++;
5463 }
5464
5465 return status;
5466}
5467
Tim Petersced69f82003-09-16 20:30:58 +00005468static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469int fixswapcase(PyUnicodeObject *self)
5470{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005471 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 Py_UNICODE *s = self->str;
5473 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005474
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 while (len-- > 0) {
5476 if (Py_UNICODE_ISUPPER(*s)) {
5477 *s = Py_UNICODE_TOLOWER(*s);
5478 status = 1;
5479 } else if (Py_UNICODE_ISLOWER(*s)) {
5480 *s = Py_UNICODE_TOUPPER(*s);
5481 status = 1;
5482 }
5483 s++;
5484 }
5485
5486 return status;
5487}
5488
Tim Petersced69f82003-09-16 20:30:58 +00005489static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490int fixcapitalize(PyUnicodeObject *self)
5491{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005492 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005493 Py_UNICODE *s = self->str;
5494 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005495
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005496 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005497 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005498 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005499 *s = Py_UNICODE_TOUPPER(*s);
5500 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005502 s++;
5503 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005504 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005505 *s = Py_UNICODE_TOLOWER(*s);
5506 status = 1;
5507 }
5508 s++;
5509 }
5510 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511}
5512
5513static
5514int fixtitle(PyUnicodeObject *self)
5515{
5516 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5517 register Py_UNICODE *e;
5518 int previous_is_cased;
5519
5520 /* Shortcut for single character strings */
5521 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005522 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5523 if (*p != ch) {
5524 *p = ch;
5525 return 1;
5526 }
5527 else
5528 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 }
Tim Petersced69f82003-09-16 20:30:58 +00005530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 e = p + PyUnicode_GET_SIZE(self);
5532 previous_is_cased = 0;
5533 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005534 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005535
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 if (previous_is_cased)
5537 *p = Py_UNICODE_TOLOWER(ch);
5538 else
5539 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005540
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005541 if (Py_UNICODE_ISLOWER(ch) ||
5542 Py_UNICODE_ISUPPER(ch) ||
5543 Py_UNICODE_ISTITLE(ch))
5544 previous_is_cased = 1;
5545 else
5546 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 }
5548 return 1;
5549}
5550
Tim Peters8ce9f162004-08-27 01:49:32 +00005551PyObject *
5552PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
Tim Peters8ce9f162004-08-27 01:49:32 +00005554 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005555 const Py_UNICODE blank = ' ';
5556 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005557 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005558 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005559 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5560 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005561 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5562 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005564 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005565 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 fseq = PySequence_Fast(seq, "");
5568 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005569 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005570 }
5571
Tim Peters91879ab2004-08-27 22:35:44 +00005572 /* Grrrr. A codec may be invoked to convert str objects to
5573 * Unicode, and so it's possible to call back into Python code
5574 * during PyUnicode_FromObject(), and so it's possible for a sick
5575 * codec to change the size of fseq (if seq is a list). Therefore
5576 * we have to keep refetching the size -- can't assume seqlen
5577 * is invariant.
5578 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005579 seqlen = PySequence_Fast_GET_SIZE(fseq);
5580 /* If empty sequence, return u"". */
5581 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005582 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5583 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 }
5585 /* If singleton sequence with an exact Unicode, return that. */
5586 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005587 item = PySequence_Fast_GET_ITEM(fseq, 0);
5588 if (PyUnicode_CheckExact(item)) {
5589 Py_INCREF(item);
5590 res = (PyUnicodeObject *)item;
5591 goto Done;
5592 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005593 }
5594
Tim Peters05eba1f2004-08-27 21:32:02 +00005595 /* At least two items to join, or one that isn't exact Unicode. */
5596 if (seqlen > 1) {
5597 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005598 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005599 sep = &blank;
5600 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005601 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005602 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005603 internal_separator = PyUnicode_FromObject(separator);
5604 if (internal_separator == NULL)
5605 goto onError;
5606 sep = PyUnicode_AS_UNICODE(internal_separator);
5607 seplen = PyUnicode_GET_SIZE(internal_separator);
5608 /* In case PyUnicode_FromObject() mutated seq. */
5609 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005610 }
5611 }
5612
5613 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005614 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005615 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005616 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005617 res_p = PyUnicode_AS_UNICODE(res);
5618 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005619
Tim Peters05eba1f2004-08-27 21:32:02 +00005620 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005621 Py_ssize_t itemlen;
5622 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005623
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005624 item = PySequence_Fast_GET_ITEM(fseq, i);
5625 /* Convert item to Unicode. */
5626 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5627 PyErr_Format(PyExc_TypeError,
5628 "sequence item %zd: expected string or Unicode,"
5629 " %.80s found",
5630 i, Py_TYPE(item)->tp_name);
5631 goto onError;
5632 }
5633 item = PyUnicode_FromObject(item);
5634 if (item == NULL)
5635 goto onError;
5636 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005637
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005638 /* In case PyUnicode_FromObject() mutated seq. */
5639 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005640
Tim Peters8ce9f162004-08-27 01:49:32 +00005641 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005642 itemlen = PyUnicode_GET_SIZE(item);
5643 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005644 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005645 goto Overflow;
5646 if (i < seqlen - 1) {
5647 new_res_used += seplen;
5648 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005649 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005650 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005651 if (new_res_used > res_alloc) {
5652 /* double allocated size until it's big enough */
5653 do {
5654 res_alloc += res_alloc;
5655 if (res_alloc <= 0)
5656 goto Overflow;
5657 } while (new_res_used > res_alloc);
5658 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5659 Py_DECREF(item);
5660 goto onError;
5661 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005662 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005663 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005664
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005665 /* Copy item, and maybe the separator. */
5666 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5667 res_p += itemlen;
5668 if (i < seqlen - 1) {
5669 Py_UNICODE_COPY(res_p, sep, seplen);
5670 res_p += seplen;
5671 }
5672 Py_DECREF(item);
5673 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005675
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 /* Shrink res to match the used area; this probably can't fail,
5677 * but it's cheap to check.
5678 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005679 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005680 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005681
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005683 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 return (PyObject *)res;
5686
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005687 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005688 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005689 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005690 Py_DECREF(item);
5691 /* fall through */
5692
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005693 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005694 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005695 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005696 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 return NULL;
5698}
5699
Tim Petersced69f82003-09-16 20:30:58 +00005700static
5701PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005702 Py_ssize_t left,
5703 Py_ssize_t right,
5704 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705{
5706 PyUnicodeObject *u;
5707
5708 if (left < 0)
5709 left = 0;
5710 if (right < 0)
5711 right = 0;
5712
Tim Peters7a29bd52001-09-12 03:03:31 +00005713 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 Py_INCREF(self);
5715 return self;
5716 }
5717
Neal Norwitze7d8be82008-07-31 17:17:14 +00005718 if (left > PY_SSIZE_T_MAX - self->length ||
5719 right > PY_SSIZE_T_MAX - (left + self->length)) {
5720 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5721 return NULL;
5722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 u = _PyUnicode_New(left + self->length + right);
5724 if (u) {
5725 if (left)
5726 Py_UNICODE_FILL(u->str, fill, left);
5727 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5728 if (right)
5729 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5730 }
5731
5732 return u;
5733}
5734
Antoine Pitrou64672132010-01-13 07:55:48 +00005735PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739 string = PyUnicode_FromObject(string);
5740 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Antoine Pitrou64672132010-01-13 07:55:48 +00005743 list = stringlib_splitlines(
5744 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5745 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
5747 Py_DECREF(string);
5748 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749}
5750
Tim Petersced69f82003-09-16 20:30:58 +00005751static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005753 PyUnicodeObject *substring,
5754 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005757 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005760 return stringlib_split_whitespace(
5761 (PyObject*) self, self->str, self->length, maxcount
5762 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
Antoine Pitrou64672132010-01-13 07:55:48 +00005764 return stringlib_split(
5765 (PyObject*) self, self->str, self->length,
5766 substring->str, substring->length,
5767 maxcount
5768 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769}
5770
Tim Petersced69f82003-09-16 20:30:58 +00005771static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005773 PyUnicodeObject *substring,
5774 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005777 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005780 return stringlib_rsplit_whitespace(
5781 (PyObject*) self, self->str, self->length, maxcount
5782 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005783
Antoine Pitrou64672132010-01-13 07:55:48 +00005784 return stringlib_rsplit(
5785 (PyObject*) self, self->str, self->length,
5786 substring->str, substring->length,
5787 maxcount
5788 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789}
5790
5791static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005793 PyUnicodeObject *str1,
5794 PyUnicodeObject *str2,
5795 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
5797 PyUnicodeObject *u;
5798
5799 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005800 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005801 else if (maxcount == 0 || self->length == 0)
5802 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Fredrik Lundh347ee272006-05-24 16:35:18 +00005804 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005805 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005806 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005807 if (str1->length == 0)
5808 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005809 if (str1->length == 1) {
5810 /* replace characters */
5811 Py_UNICODE u1, u2;
5812 if (!findchar(self->str, self->length, str1->str[0]))
5813 goto nothing;
5814 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5815 if (!u)
5816 return NULL;
5817 Py_UNICODE_COPY(u->str, self->str, self->length);
5818 u1 = str1->str[0];
5819 u2 = str2->str[0];
5820 for (i = 0; i < u->length; i++)
5821 if (u->str[i] == u1) {
5822 if (--maxcount < 0)
5823 break;
5824 u->str[i] = u2;
5825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005827 i = stringlib_find(
5828 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005830 if (i < 0)
5831 goto nothing;
5832 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5833 if (!u)
5834 return NULL;
5835 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005836
5837 /* change everything in-place, starting with this one */
5838 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5839 i += str1->length;
5840
5841 while ( --maxcount > 0) {
5842 i = stringlib_find(self->str+i, self->length-i,
5843 str1->str, str1->length,
5844 i);
5845 if (i == -1)
5846 break;
5847 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5848 i += str1->length;
5849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005852
Brett Cannona7f13ee2010-05-04 01:16:51 +00005853 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005854 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 Py_UNICODE *p;
5856
5857 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005858 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5859 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005860 if (n == 0)
5861 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005862 /* new_size = self->length + n * (str2->length - str1->length)); */
5863 delta = (str2->length - str1->length);
5864 if (delta == 0) {
5865 new_size = self->length;
5866 } else {
5867 product = n * (str2->length - str1->length);
5868 if ((product / (str2->length - str1->length)) != n) {
5869 PyErr_SetString(PyExc_OverflowError,
5870 "replace string is too long");
5871 return NULL;
5872 }
5873 new_size = self->length + product;
5874 if (new_size < 0) {
5875 PyErr_SetString(PyExc_OverflowError,
5876 "replace string is too long");
5877 return NULL;
5878 }
5879 }
5880 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005881 if (!u)
5882 return NULL;
5883 i = 0;
5884 p = u->str;
5885 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005886 while (n-- > 0) {
5887 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005888 j = stringlib_find(self->str+i, self->length-i,
5889 str1->str, str1->length,
5890 i);
5891 if (j == -1)
5892 break;
5893 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005894 /* copy unchanged part [i:j] */
5895 Py_UNICODE_COPY(p, self->str+i, j-i);
5896 p += j - i;
5897 }
5898 /* copy substitution string */
5899 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 Py_UNICODE_COPY(p, str2->str, str2->length);
5901 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005902 }
5903 i = j + str1->length;
5904 }
5905 if (i < self->length)
5906 /* copy tail [i:] */
5907 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005908 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005909 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005910 while (n > 0) {
5911 Py_UNICODE_COPY(p, str2->str, str2->length);
5912 p += str2->length;
5913 if (--n <= 0)
5914 break;
5915 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 }
5919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005922 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005923 /* nothing to replace; return original string (when possible) */
5924 if (PyUnicode_CheckExact(self)) {
5925 Py_INCREF(self);
5926 return (PyObject *) self;
5927 }
5928 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
5931/* --- Unicode Object Methods --------------------------------------------- */
5932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005933PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005934 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935\n\
5936Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938
5939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005940unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return fixup(self, fixtitle);
5943}
5944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005946 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947\n\
5948Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005949have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005952unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 return fixup(self, fixcapitalize);
5955}
5956
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace every list entry by its fixup(..., fixcapitalize)
   result, then join the list again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5994
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005995/* Argument converter. Coerces to a single unicode character */
5996
5997static int
5998convert_uc(PyObject *obj, void *addr)
5999{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006000 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6001 PyObject *uniobj;
6002 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006003
Benjamin Peterson857ce152009-01-31 16:29:18 +00006004 uniobj = PyUnicode_FromObject(obj);
6005 if (uniobj == NULL) {
6006 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006007 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006008 return 0;
6009 }
6010 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6011 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006012 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006013 Py_DECREF(uniobj);
6014 return 0;
6015 }
6016 unistr = PyUnicode_AS_UNICODE(uniobj);
6017 *fillcharloc = unistr[0];
6018 Py_DECREF(uniobj);
6019 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006020}
6021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006022PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006023 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006025Return S centered in a Unicode string of length width. Padding is\n\
6026done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
6028static PyObject *
6029unicode_center(PyUnicodeObject *self, PyObject *args)
6030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006031 Py_ssize_t marg, left;
6032 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006033 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Thomas Woutersde017742006-02-16 19:34:37 +00006035 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
6037
Tim Peters7a29bd52001-09-12 03:03:31 +00006038 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 Py_INCREF(self);
6040 return (PyObject*) self;
6041 }
6042
6043 marg = width - self->length;
6044 left = marg / 2 + (marg & width & 1);
6045
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006046 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047}
6048
Marc-André Lemburge5034372000-08-08 08:04:29 +00006049#if 0
6050
6051/* This code should go into some future Unicode collation support
6052 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006053 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006054
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006055/* speedy UTF-16 code point order comparison */
6056/* gleaned from: */
6057/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6058
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006059static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006060{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006061 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006062 0, 0, 0, 0, 0, 0, 0, 0,
6063 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006064 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006065};
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067static int
6068unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6069{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006070 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 Py_UNICODE *s1 = str1->str;
6073 Py_UNICODE *s2 = str2->str;
6074
6075 len1 = str1->length;
6076 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006079 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006080
6081 c1 = *s1++;
6082 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006083
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006084 if (c1 > (1<<11) * 26)
6085 c1 += utf16Fixup[c1>>11];
6086 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006089
6090 if (c1 != c2)
6091 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006092
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006093 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
6095
6096 return (len1 < len2) ? -1 : (len1 != len2);
6097}
6098
Marc-André Lemburge5034372000-08-08 08:04:29 +00006099#else
6100
6101static int
6102unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6103{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006104 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006105
6106 Py_UNICODE *s1 = str1->str;
6107 Py_UNICODE *s2 = str2->str;
6108
6109 len1 = str1->length;
6110 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Marc-André Lemburge5034372000-08-08 08:04:29 +00006112 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006113 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006114
Fredrik Lundh45714e92001-06-26 16:39:36 +00006115 c1 = *s1++;
6116 c2 = *s2++;
6117
6118 if (c1 != c2)
6119 return (c1 < c2) ? -1 : 1;
6120
Marc-André Lemburge5034372000-08-08 08:04:29 +00006121 len1--; len2--;
6122 }
6123
6124 return (len1 < len2) ? -1 : (len1 != len2);
6125}
6126
6127#endif
6128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006130 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
6132 PyUnicodeObject *u = NULL, *v = NULL;
6133 int result;
6134
6135 /* Coerce the two arguments */
6136 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6137 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6140 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Thomas Wouters7e474022000-07-16 12:04:32 +00006143 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006145 Py_DECREF(u);
6146 Py_DECREF(v);
6147 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
6149
6150 result = unicode_compare(u, v);
6151
6152 Py_DECREF(u);
6153 Py_DECREF(v);
6154 return result;
6155
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006156 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_XDECREF(u);
6158 Py_XDECREF(v);
6159 return -1;
6160}
6161
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006162PyObject *PyUnicode_RichCompare(PyObject *left,
6163 PyObject *right,
6164 int op)
6165{
6166 int result;
6167
6168 result = PyUnicode_Compare(left, right);
6169 if (result == -1 && PyErr_Occurred())
6170 goto onError;
6171
6172 /* Convert the return value to a Boolean */
6173 switch (op) {
6174 case Py_EQ:
6175 result = (result == 0);
6176 break;
6177 case Py_NE:
6178 result = (result != 0);
6179 break;
6180 case Py_LE:
6181 result = (result <= 0);
6182 break;
6183 case Py_GE:
6184 result = (result >= 0);
6185 break;
6186 case Py_LT:
6187 result = (result == -1);
6188 break;
6189 case Py_GT:
6190 result = (result == 1);
6191 break;
6192 }
6193 return PyBool_FromLong(result);
6194
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006195 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006196
6197 /* Standard case
6198
6199 Type errors mean that PyUnicode_FromObject() could not convert
6200 one of the arguments (usually the right hand side) to Unicode,
6201 ie. we can't handle the comparison request. However, it is
6202 possible that the other object knows a comparison method, which
6203 is why we return Py_NotImplemented to give the other object a
6204 chance.
6205
6206 */
6207 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6208 PyErr_Clear();
6209 Py_INCREF(Py_NotImplemented);
6210 return Py_NotImplemented;
6211 }
6212 if (op != Py_EQ && op != Py_NE)
6213 return NULL;
6214
6215 /* Equality comparison.
6216
6217 This is a special case: we silence any PyExc_UnicodeDecodeError
6218 and instead turn it into a PyErr_UnicodeWarning.
6219
6220 */
6221 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6222 return NULL;
6223 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006224 if (PyErr_Warn(PyExc_UnicodeWarning,
6225 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006226 "Unicode equal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal" :
6229 "Unicode unequal comparison "
6230 "failed to convert both arguments to Unicode - "
6231 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006232 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006233 return NULL;
6234 result = (op == Py_NE);
6235 return PyBool_FromLong(result);
6236}
6237
Guido van Rossum403d68b2000-03-13 15:55:09 +00006238int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006239 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006240{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006241 PyObject *str, *sub;
6242 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006243
6244 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006245 sub = PyUnicode_FromObject(element);
6246 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006247 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006248 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006249
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006250 str = PyUnicode_FromObject(container);
6251 if (!str) {
6252 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006253 return -1;
6254 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006255
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006256 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006257
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006258 Py_DECREF(str);
6259 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006260
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006261 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006262}
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264/* Concat to string or Unicode object giving a new Unicode object. */
6265
6266PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006267 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
6269 PyUnicodeObject *u = NULL, *v = NULL, *w;
6270
6271 /* Coerce the two arguments */
6272 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6273 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6276 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278
6279 /* Shortcuts */
6280 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 Py_DECREF(v);
6282 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 }
6284 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006285 Py_DECREF(u);
6286 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288
6289 /* Concat the two Unicode strings */
6290 w = _PyUnicode_New(u->length + v->length);
6291 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 Py_UNICODE_COPY(w->str, u->str, u->length);
6294 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6295
6296 Py_DECREF(u);
6297 Py_DECREF(v);
6298 return (PyObject *)w;
6299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006300 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_XDECREF(u);
6302 Py_XDECREF(v);
6303 return NULL;
6304}
6305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006306PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006307 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006309Return the number of non-overlapping occurrences of substring sub in\n\
6310Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006311interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313static PyObject *
6314unicode_count(PyUnicodeObject *self, PyObject *args)
6315{
6316 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006317 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006318 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 PyObject *result;
6320
Jesus Cea44e81682011-04-20 16:39:15 +02006321 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6322 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006323 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006324
Antoine Pitrou64672132010-01-13 07:55:48 +00006325 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006326 result = PyInt_FromSsize_t(
6327 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006328 substring->str, substring->length,
6329 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006330 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
6332 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 return result;
6335}
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006338 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006340Encodes S using the codec registered for encoding. encoding defaults\n\
6341to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006342handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006343a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6344'xmlcharrefreplace' as well as any other name registered with\n\
6345codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346
6347static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006348unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006350 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 char *encoding = NULL;
6352 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006353 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006354
Benjamin Peterson332d7212009-09-18 21:14:55 +00006355 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6356 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006358 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006359 if (v == NULL)
6360 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006361 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006362 PyErr_Format(PyExc_TypeError,
6363 "encoder did not return a string/unicode object "
6364 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006365 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006366 Py_DECREF(v);
6367 return NULL;
6368 }
6369 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006370
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006371 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006372 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006373}
6374
6375PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006376 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006377\n\
6378Decodes S using the codec registered for encoding. encoding defaults\n\
6379to the default encoding. errors may be given to set a different error\n\
6380handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6381a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006382as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006383able to handle UnicodeDecodeErrors.");
6384
6385static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006386unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006387{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006388 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389 char *encoding = NULL;
6390 char *errors = NULL;
6391 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006392
Benjamin Peterson332d7212009-09-18 21:14:55 +00006393 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6394 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006395 return NULL;
6396 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006397 if (v == NULL)
6398 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006399 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400 PyErr_Format(PyExc_TypeError,
6401 "decoder did not return a string/unicode object "
6402 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006403 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006404 Py_DECREF(v);
6405 return NULL;
6406 }
6407 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006408
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006409 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411}
6412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006413PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006414 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415\n\
6416Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006417If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
6419static PyObject*
6420unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6421{
6422 Py_UNICODE *e;
6423 Py_UNICODE *p;
6424 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006425 Py_UNICODE *qe;
6426 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 PyUnicodeObject *u;
6428 int tabsize = 8;
6429
6430 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
Thomas Wouters7e474022000-07-16 12:04:32 +00006433 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006434 i = 0; /* chars up to and including most recent \n or \r */
6435 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6436 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 for (p = self->str; p < e; p++)
6438 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006439 if (tabsize > 0) {
6440 incr = tabsize - (j % tabsize); /* cannot overflow */
6441 if (j > PY_SSIZE_T_MAX - incr)
6442 goto overflow1;
6443 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006444 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006447 if (j > PY_SSIZE_T_MAX - 1)
6448 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 j++;
6450 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006451 if (i > PY_SSIZE_T_MAX - j)
6452 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006454 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 }
6456 }
6457
Guido van Rossum5bdff602008-03-11 21:18:06 +00006458 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006459 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* Second pass: create output string and fill it */
6462 u = _PyUnicode_New(i + j);
6463 if (!u)
6464 return NULL;
6465
Guido van Rossum5bdff602008-03-11 21:18:06 +00006466 j = 0; /* same as in first pass */
6467 q = u->str; /* next output char */
6468 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469
6470 for (p = self->str; p < e; p++)
6471 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006472 if (tabsize > 0) {
6473 i = tabsize - (j % tabsize);
6474 j += i;
6475 while (i--) {
6476 if (q >= qe)
6477 goto overflow2;
6478 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006479 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006480 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006481 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006482 else {
6483 if (q >= qe)
6484 goto overflow2;
6485 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006486 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 if (*p == '\n' || *p == '\r')
6488 j = 0;
6489 }
6490
6491 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006492
6493 overflow2:
6494 Py_DECREF(u);
6495 overflow1:
6496 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006500PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006501 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502\n\
6503Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006504such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505arguments start and end are interpreted as in slice notation.\n\
6506\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006507Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
6509static PyObject *
6510unicode_find(PyUnicodeObject *self, PyObject *args)
6511{
Jesus Cea44e81682011-04-20 16:39:15 +02006512 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006513 Py_ssize_t start;
6514 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006515 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
Jesus Cea44e81682011-04-20 16:39:15 +02006517 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6518 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006521 result = stringlib_find_slice(
6522 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6523 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6524 start, end
6525 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
6527 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528
6529 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
6532static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006533unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534{
6535 if (index < 0 || index >= self->length) {
6536 PyErr_SetString(PyExc_IndexError, "string index out of range");
6537 return NULL;
6538 }
6539
6540 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6541}
6542
6543static long
6544unicode_hash(PyUnicodeObject *self)
6545{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006546 /* Since Unicode objects compare equal to their ASCII string
6547 counterparts, they should use the individual character values
6548 as basis for their hash value. This is needed to assure that
6549 strings and Unicode objects behave in the same way as
6550 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006553 register Py_UNICODE *p;
6554 register long x;
6555
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006556#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006557 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006558#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006560 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006561 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006562 /*
6563 We make the hash of the empty string be 0, rather than using
6564 (prefix ^ suffix), since this slightly obfuscates the hash secret
6565 */
6566 if (len == 0) {
6567 self->hash = 0;
6568 return 0;
6569 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006571 x = _Py_HashSecret.prefix;
6572 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006573 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006574 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006575 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006576 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006577 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006578 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006579 self->hash = x;
6580 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581}
6582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006584 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006586Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588static PyObject *
6589unicode_index(PyUnicodeObject *self, PyObject *args)
6590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006592 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006593 Py_ssize_t start;
6594 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
Jesus Cea44e81682011-04-20 16:39:15 +02006596 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6597 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006600 result = stringlib_find_slice(
6601 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6602 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6603 start, end
6604 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
6606 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 if (result < 0) {
6609 PyErr_SetString(PyExc_ValueError, "substring not found");
6610 return NULL;
6611 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006612
Martin v. Löwis18e16552006-02-15 17:27:45 +00006613 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614}
6615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006617 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006619Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621
6622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006623unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
6625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6626 register const Py_UNICODE *e;
6627 int cased;
6628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 /* Shortcut for single character strings */
6630 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006631 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006634 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 e = p + PyUnicode_GET_SIZE(self);
6638 cased = 0;
6639 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006640 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006641
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006642 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6643 return PyBool_FromLong(0);
6644 else if (!cased && Py_UNICODE_ISLOWER(ch))
6645 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006647 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006651 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006653Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006657unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
6659 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6660 register const Py_UNICODE *e;
6661 int cased;
6662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 /* Shortcut for single character strings */
6664 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006667 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006668 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 e = p + PyUnicode_GET_SIZE(self);
6672 cased = 0;
6673 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006674 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006676 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6677 return PyBool_FromLong(0);
6678 else if (!cased && Py_UNICODE_ISUPPER(ch))
6679 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006681 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682}
6683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006684PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006687Return True if S is a titlecased string and there is at least one\n\
6688character in S, i.e. upper- and titlecase characters may only\n\
6689follow uncased characters and lowercase characters only cased ones.\n\
6690Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
6692static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006693unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694{
6695 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6696 register const Py_UNICODE *e;
6697 int cased, previous_is_cased;
6698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 /* Shortcut for single character strings */
6700 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006701 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6702 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006704 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006705 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006706 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 e = p + PyUnicode_GET_SIZE(self);
6709 cased = 0;
6710 previous_is_cased = 0;
6711 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006712 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006713
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006714 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6715 if (previous_is_cased)
6716 return PyBool_FromLong(0);
6717 previous_is_cased = 1;
6718 cased = 1;
6719 }
6720 else if (Py_UNICODE_ISLOWER(ch)) {
6721 if (!previous_is_cased)
6722 return PyBool_FromLong(0);
6723 previous_is_cased = 1;
6724 cased = 1;
6725 }
6726 else
6727 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006729 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006735Return True if all characters in S are whitespace\n\
6736and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
6738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006739unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
6741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6742 register const Py_UNICODE *e;
6743
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 /* Shortcut for single character strings */
6745 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006746 Py_UNICODE_ISSPACE(*p))
6747 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006749 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006750 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 e = p + PyUnicode_GET_SIZE(self);
6754 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 if (!Py_UNICODE_ISSPACE(*p))
6756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006758 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006761PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006764Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006766
6767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006768unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006769{
6770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6771 register const Py_UNICODE *e;
6772
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006773 /* Shortcut for single character strings */
6774 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006775 Py_UNICODE_ISALPHA(*p))
6776 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006777
6778 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006779 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006780 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006781
6782 e = p + PyUnicode_GET_SIZE(self);
6783 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006784 if (!Py_UNICODE_ISALPHA(*p))
6785 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006787 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006788}
6789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006790PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006793Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795
6796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006797unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798{
6799 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6800 register const Py_UNICODE *e;
6801
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802 /* Shortcut for single character strings */
6803 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006804 Py_UNICODE_ISALNUM(*p))
6805 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806
6807 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006808 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006809 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006810
6811 e = p + PyUnicode_GET_SIZE(self);
6812 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006813 if (!Py_UNICODE_ISALNUM(*p))
6814 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006817}
6818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006820 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
6825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006826unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
6828 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6829 register const Py_UNICODE *e;
6830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 /* Shortcut for single character strings */
6832 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006833 Py_UNICODE_ISDECIMAL(*p))
6834 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006836 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006837 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006838 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 e = p + PyUnicode_GET_SIZE(self);
6841 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006842 if (!Py_UNICODE_ISDECIMAL(*p))
6843 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846}
6847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006851Return True if all characters in S are digits\n\
6852and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853
6854static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006855unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
6857 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6858 register const Py_UNICODE *e;
6859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 /* Shortcut for single character strings */
6861 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006862 Py_UNICODE_ISDIGIT(*p))
6863 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006865 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006866 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006867 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 e = p + PyUnicode_GET_SIZE(self);
6870 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 if (!Py_UNICODE_ISDIGIT(*p))
6872 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875}
6876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006877PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006878 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
6883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006884unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
6886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6887 register const Py_UNICODE *e;
6888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 /* Shortcut for single character strings */
6890 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006891 Py_UNICODE_ISNUMERIC(*p))
6892 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006894 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006895 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 e = p + PyUnicode_GET_SIZE(self);
6899 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006900 if (!Py_UNICODE_ISNUMERIC(*p))
6901 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006907 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908\n\
6909Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006910iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006913unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006915 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916}
6917
Martin v. Löwis18e16552006-02-15 17:27:45 +00006918static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919unicode_length(PyUnicodeObject *self)
6920{
6921 return self->length;
6922}
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006925 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006927Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006928done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
6930static PyObject *
6931unicode_ljust(PyUnicodeObject *self, PyObject *args)
6932{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006933 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006934 Py_UNICODE fillchar = ' ';
6935
Martin v. Löwis412fb672006-04-13 06:34:32 +00006936 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 return NULL;
6938
Tim Peters7a29bd52001-09-12 03:03:31 +00006939 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 Py_INCREF(self);
6941 return (PyObject*) self;
6942 }
6943
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006944 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945}
6946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006948 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006950Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006953unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 return fixup(self, fixlower);
6956}
6957
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple formats, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
6966
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006967/* externally visible for str.strip(unicode) */
6968PyObject *
6969_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6970{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006971 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6972 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6973 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6974 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6975 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006976
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006977 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006978
Benjamin Peterson857ce152009-01-31 16:29:18 +00006979 i = 0;
6980 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6982 i++;
6983 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006984 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006985
Benjamin Peterson857ce152009-01-31 16:29:18 +00006986 j = len;
6987 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006988 do {
6989 j--;
6990 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6991 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006992 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006993
Benjamin Peterson857ce152009-01-31 16:29:18 +00006994 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006995 Py_INCREF(self);
6996 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006997 }
6998 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006999 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007000}
7001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002
7003static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007004do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7007 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007008
Benjamin Peterson857ce152009-01-31 16:29:18 +00007009 i = 0;
7010 if (striptype != RIGHTSTRIP) {
7011 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7012 i++;
7013 }
7014 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007015
Benjamin Peterson857ce152009-01-31 16:29:18 +00007016 j = len;
7017 if (striptype != LEFTSTRIP) {
7018 do {
7019 j--;
7020 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7021 j++;
7022 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023
Benjamin Peterson857ce152009-01-31 16:29:18 +00007024 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7025 Py_INCREF(self);
7026 return (PyObject*)self;
7027 }
7028 else
7029 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030}
7031
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007032
7033static PyObject *
7034do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7035{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007036 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037
Benjamin Peterson857ce152009-01-31 16:29:18 +00007038 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7039 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040
Benjamin Peterson857ce152009-01-31 16:29:18 +00007041 if (sep != NULL && sep != Py_None) {
7042 if (PyUnicode_Check(sep))
7043 return _PyUnicode_XStrip(self, striptype, sep);
7044 else if (PyString_Check(sep)) {
7045 PyObject *res;
7046 sep = PyUnicode_FromObject(sep);
7047 if (sep==NULL)
7048 return NULL;
7049 res = _PyUnicode_XStrip(self, striptype, sep);
7050 Py_DECREF(sep);
7051 return res;
7052 }
7053 else {
7054 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007055 "%s arg must be None, unicode or str",
7056 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007057 return NULL;
7058 }
7059 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060
Benjamin Peterson857ce152009-01-31 16:29:18 +00007061 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062}
7063
7064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007065PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007066 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067\n\
7068Return a copy of the string S with leading and trailing\n\
7069whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007070If chars is given and not None, remove characters in chars instead.\n\
7071If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007072
7073static PyObject *
7074unicode_strip(PyUnicodeObject *self, PyObject *args)
7075{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007076 if (PyTuple_GET_SIZE(args) == 0)
7077 return do_strip(self, BOTHSTRIP); /* Common case */
7078 else
7079 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080}
7081
7082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007084 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085\n\
7086Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007087If chars is given and not None, remove characters in chars instead.\n\
7088If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089
7090static PyObject *
7091unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7092{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007093 if (PyTuple_GET_SIZE(args) == 0)
7094 return do_strip(self, LEFTSTRIP); /* Common case */
7095 else
7096 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097}
7098
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007101 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102\n\
7103Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007104If chars is given and not None, remove characters in chars instead.\n\
7105If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106
7107static PyObject *
7108unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7109{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007110 if (PyTuple_GET_SIZE(args) == 0)
7111 return do_strip(self, RIGHTSTRIP); /* Common case */
7112 else
7113 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114}
7115
7116
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119{
7120 PyUnicodeObject *u;
7121 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007122 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007123 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124
7125 if (len < 0)
7126 len = 0;
7127
Tim Peters7a29bd52001-09-12 03:03:31 +00007128 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 /* no repeat, return original string */
7130 Py_INCREF(str);
7131 return (PyObject*) str;
7132 }
Tim Peters8f422462000-09-09 06:13:41 +00007133
7134 /* ensure # of chars needed doesn't overflow int and # of bytes
7135 * needed doesn't overflow size_t
7136 */
7137 nchars = len * str->length;
7138 if (len && nchars / len != str->length) {
7139 PyErr_SetString(PyExc_OverflowError,
7140 "repeated string is too long");
7141 return NULL;
7142 }
7143 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7144 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7145 PyErr_SetString(PyExc_OverflowError,
7146 "repeated string is too long");
7147 return NULL;
7148 }
7149 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 if (!u)
7151 return NULL;
7152
7153 p = u->str;
7154
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007155 if (str->length == 1 && len > 0) {
7156 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007157 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007158 Py_ssize_t done = 0; /* number of characters copied this far */
7159 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007160 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007161 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007162 }
7163 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007164 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007165 Py_UNICODE_COPY(p+done, p, n);
7166 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
7170 return (PyObject*) u;
7171}
7172
7173PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007174 PyObject *subobj,
7175 PyObject *replobj,
7176 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177{
7178 PyObject *self;
7179 PyObject *str1;
7180 PyObject *str2;
7181 PyObject *result;
7182
7183 self = PyUnicode_FromObject(obj);
7184 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 str1 = PyUnicode_FromObject(subobj);
7187 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007188 Py_DECREF(self);
7189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 }
7191 str2 = PyUnicode_FromObject(replobj);
7192 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007193 Py_DECREF(self);
7194 Py_DECREF(str1);
7195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 }
Tim Petersced69f82003-09-16 20:30:58 +00007197 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007198 (PyUnicodeObject *)str1,
7199 (PyUnicodeObject *)str2,
7200 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 Py_DECREF(self);
7202 Py_DECREF(str1);
7203 Py_DECREF(str2);
7204 return result;
7205}
7206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007207PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007208 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209\n\
7210Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007211old replaced by new. If the optional argument count is\n\
7212given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214static PyObject*
7215unicode_replace(PyUnicodeObject *self, PyObject *args)
7216{
7217 PyUnicodeObject *str1;
7218 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 PyObject *result;
7221
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 return NULL;
7224 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7225 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007228 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007229 Py_DECREF(str1);
7230 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232
7233 result = replace(self, str1, str2, maxcount);
7234
7235 Py_DECREF(str1);
7236 Py_DECREF(str2);
7237 return result;
7238}
7239
7240static
7241PyObject *unicode_repr(PyObject *unicode)
7242{
7243 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007244 PyUnicode_GET_SIZE(unicode),
7245 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246}
7247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007248PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250\n\
7251Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007252such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253arguments start and end are interpreted as in slice notation.\n\
7254\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007255Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
7257static PyObject *
7258unicode_rfind(PyUnicodeObject *self, PyObject *args)
7259{
Jesus Cea44e81682011-04-20 16:39:15 +02007260 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007261 Py_ssize_t start;
7262 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007263 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
Jesus Cea44e81682011-04-20 16:39:15 +02007265 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7266 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007269 result = stringlib_rfind_slice(
7270 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7271 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7272 start, end
7273 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
7275 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007276
7277 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278}
7279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007281 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007283Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject *
7286unicode_rindex(PyUnicodeObject *self, PyObject *args)
7287{
Jesus Cea44e81682011-04-20 16:39:15 +02007288 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007289 Py_ssize_t start;
7290 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007291 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
Jesus Cea44e81682011-04-20 16:39:15 +02007293 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7294 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007297 result = stringlib_rfind_slice(
7298 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7299 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7300 start, end
7301 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302
7303 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007304
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 if (result < 0) {
7306 PyErr_SetString(PyExc_ValueError, "substring not found");
7307 return NULL;
7308 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007309 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310}
7311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007312PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007313 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007315Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007316done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
7318static PyObject *
7319unicode_rjust(PyUnicodeObject *self, PyObject *args)
7320{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007321 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007322 Py_UNICODE fillchar = ' ';
7323
Martin v. Löwis412fb672006-04-13 06:34:32 +00007324 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 return NULL;
7326
Tim Peters7a29bd52001-09-12 03:03:31 +00007327 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 Py_INCREF(self);
7329 return (PyObject*) self;
7330 }
7331
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007332 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007336unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337{
7338 /* standard clamping */
7339 if (start < 0)
7340 start = 0;
7341 if (end < 0)
7342 end = 0;
7343 if (end > self->length)
7344 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007345 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 /* full slice, return original string */
7347 Py_INCREF(self);
7348 return (PyObject*) self;
7349 }
7350 if (start > end)
7351 start = end;
7352 /* copy slice */
7353 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007354 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355}
7356
7357PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007358 PyObject *sep,
7359 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360{
7361 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 s = PyUnicode_FromObject(s);
7364 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007365 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007366 if (sep != NULL) {
7367 sep = PyUnicode_FromObject(sep);
7368 if (sep == NULL) {
7369 Py_DECREF(s);
7370 return NULL;
7371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 }
7373
7374 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7375
7376 Py_DECREF(s);
7377 Py_XDECREF(sep);
7378 return result;
7379}
7380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007381PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007382 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383\n\
7384Return a list of the words in S, using sep as the\n\
7385delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007386splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007387whitespace string is a separator and empty strings are\n\
7388removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
7390static PyObject*
7391unicode_split(PyUnicodeObject *self, PyObject *args)
7392{
7393 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007394 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
Martin v. Löwis18e16552006-02-15 17:27:45 +00007396 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 return NULL;
7398
7399 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007400 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007402 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007404 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405}
7406
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007407PyObject *
7408PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7409{
7410 PyObject* str_obj;
7411 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007412 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007413
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007414 str_obj = PyUnicode_FromObject(str_in);
7415 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007416 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007417 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007418 if (!sep_obj) {
7419 Py_DECREF(str_obj);
7420 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007421 }
7422
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007423 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007424 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7425 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7426 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007427
Fredrik Lundhb9479482006-05-26 17:22:38 +00007428 Py_DECREF(sep_obj);
7429 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007430
7431 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007432}
7433
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007434
7435PyObject *
7436PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7437{
7438 PyObject* str_obj;
7439 PyObject* sep_obj;
7440 PyObject* out;
7441
7442 str_obj = PyUnicode_FromObject(str_in);
7443 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007444 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007445 sep_obj = PyUnicode_FromObject(sep_in);
7446 if (!sep_obj) {
7447 Py_DECREF(str_obj);
7448 return NULL;
7449 }
7450
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007451 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007452 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7453 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7454 );
7455
7456 Py_DECREF(sep_obj);
7457 Py_DECREF(str_obj);
7458
7459 return out;
7460}
7461
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007462PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007463 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007464\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007465Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007466the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007467found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007468
7469static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007470unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007471{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007472 return PyUnicode_Partition((PyObject *)self, separator);
7473}
7474
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007475PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007476 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007477\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007478Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007479the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007480separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007481
7482static PyObject*
7483unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7484{
7485 return PyUnicode_RPartition((PyObject *)self, separator);
7486}
7487
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007488PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007489 PyObject *sep,
7490 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007491{
7492 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007493
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007494 s = PyUnicode_FromObject(s);
7495 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007496 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007497 if (sep != NULL) {
7498 sep = PyUnicode_FromObject(sep);
7499 if (sep == NULL) {
7500 Py_DECREF(s);
7501 return NULL;
7502 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007503 }
7504
7505 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7506
7507 Py_DECREF(s);
7508 Py_XDECREF(sep);
7509 return result;
7510}
7511
7512PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007513 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514\n\
7515Return a list of the words in S, using sep as the\n\
7516delimiter string, starting at the end of the string and\n\
7517working to the front. If maxsplit is given, at most maxsplit\n\
7518splits are done. If sep is not specified, any whitespace string\n\
7519is a separator.");
7520
7521static PyObject*
7522unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7523{
7524 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007525 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007526
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007528 return NULL;
7529
7530 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007531 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007532 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007533 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007534 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007535 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007536}
7537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007539 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540\n\
7541Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007542Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007543is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
7545static PyObject*
7546unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7547{
Guido van Rossum86662912000-04-11 15:38:46 +00007548 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
Guido van Rossum86662912000-04-11 15:38:46 +00007550 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 return NULL;
7552
Guido van Rossum86662912000-04-11 15:38:46 +00007553 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554}
7555
7556static
7557PyObject *unicode_str(PyUnicodeObject *self)
7558{
Fred Drakee4315f52000-05-09 19:53:39 +00007559 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560}
7561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007563 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564\n\
7565Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007569unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 return fixup(self, fixswapcase);
7572}
7573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007575 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576\n\
7577Return a copy of the string S, where all characters have been mapped\n\
7578through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007579Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7580Unmapped characters are left untouched. Characters mapped to None\n\
7581are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
7583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007584unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585{
Tim Petersced69f82003-09-16 20:30:58 +00007586 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007587 self->length,
7588 table,
7589 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590}
7591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007592PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007593 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
7597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007598unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 return fixup(self, fixupper);
7601}
7602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007603PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007604 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605\n\
Georg Brandl98064072008-09-09 19:26:00 +00007606Pad a numeric string S with zeros on the left, to fill a field\n\
7607of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
7609static PyObject *
7610unicode_zfill(PyUnicodeObject *self, PyObject *args)
7611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007612 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 PyUnicodeObject *u;
7614
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 Py_ssize_t width;
7616 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 return NULL;
7618
7619 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007620 if (PyUnicode_CheckExact(self)) {
7621 Py_INCREF(self);
7622 return (PyObject*) self;
7623 }
7624 else
7625 return PyUnicode_FromUnicode(
7626 PyUnicode_AS_UNICODE(self),
7627 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007628 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 }
7630
7631 fill = width - self->length;
7632
7633 u = pad(self, fill, 0, '0');
7634
Walter Dörwald068325e2002-04-15 13:36:47 +00007635 if (u == NULL)
7636 return NULL;
7637
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (u->str[fill] == '+' || u->str[fill] == '-') {
7639 /* move sign to beginning of string */
7640 u->str[0] = u->str[fill];
7641 u->str[fill] = '0';
7642 }
7643
7644 return (PyObject*) u;
7645}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count;

    count = PyInt_FromLong(numfree);
    return count;
}
#endif
7654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007655PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007656 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007658Return True if S starts with the specified prefix, False otherwise.\n\
7659With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007660With optional end, stop comparing S at that position.\n\
7661prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663static PyObject *
7664unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007665 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666{
Georg Brandl24250812006-06-09 18:45:48 +00007667 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007670 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007671 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Jesus Cea44e81682011-04-20 16:39:15 +02007673 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007674 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007675 if (PyTuple_Check(subobj)) {
7676 Py_ssize_t i;
7677 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7678 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007680 if (substring == NULL)
7681 return NULL;
7682 result = tailmatch(self, substring, start, end, -1);
7683 Py_DECREF(substring);
7684 if (result) {
7685 Py_RETURN_TRUE;
7686 }
7687 }
7688 /* nothing matched */
7689 Py_RETURN_FALSE;
7690 }
7691 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007692 if (substring == NULL) {
7693 if (PyErr_ExceptionMatches(PyExc_TypeError))
7694 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7695 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007696 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007697 }
Georg Brandl24250812006-06-09 18:45:48 +00007698 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007700 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701}
7702
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007705 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007707Return True if S ends with the specified suffix, False otherwise.\n\
7708With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007709With optional end, stop comparing S at that position.\n\
7710suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
7712static PyObject *
7713unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007714 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715{
Georg Brandl24250812006-06-09 18:45:48 +00007716 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007718 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007719 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007720 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
Jesus Cea44e81682011-04-20 16:39:15 +02007722 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007723 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007724 if (PyTuple_Check(subobj)) {
7725 Py_ssize_t i;
7726 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7727 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007728 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007729 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007730 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007731 result = tailmatch(self, substring, start, end, +1);
7732 Py_DECREF(substring);
7733 if (result) {
7734 Py_RETURN_TRUE;
7735 }
7736 }
7737 Py_RETURN_FALSE;
7738 }
7739 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007740 if (substring == NULL) {
7741 if (PyErr_ExceptionMatches(PyExc_TypeError))
7742 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7743 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007744 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007745 }
Georg Brandl24250812006-06-09 18:45:48 +00007746 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007748 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749}
7750
7751
Eric Smitha9f7d622008-02-17 19:46:49 +00007752/* Implements do_string_format, which is unicode because of stringlib */
7753#include "stringlib/string_format.h"
7754
7755PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007756 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007757\n\
Eric Smith6c840852010-11-06 19:43:44 +00007758Return a formatted version of S, using substitutions from args and kwargs.\n\
7759The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007760
Eric Smithdc13b792008-05-30 18:10:04 +00007761static PyObject *
7762unicode__format__(PyObject *self, PyObject *args)
7763{
7764 PyObject *format_spec;
7765 PyObject *result = NULL;
7766 PyObject *tmp = NULL;
7767
7768 /* If 2.x, convert format_spec to the same type as value */
7769 /* This is to allow things like u''.format('') */
7770 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7771 goto done;
7772 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7773 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007774 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007775 goto done;
7776 }
7777 tmp = PyObject_Unicode(format_spec);
7778 if (tmp == NULL)
7779 goto done;
7780 format_spec = tmp;
7781
7782 result = _PyUnicode_FormatAdvanced(self,
7783 PyUnicode_AS_UNICODE(format_spec),
7784 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007785 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007786 Py_XDECREF(tmp);
7787 return result;
7788}
7789
Eric Smitha9f7d622008-02-17 19:46:49 +00007790PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007791 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007792\n\
Eric Smith6c840852010-11-06 19:43:44 +00007793Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007794
Robert Schuppenies901c9972008-06-10 10:10:31 +00007795static PyObject *
7796unicode__sizeof__(PyUnicodeObject *v)
7797{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007798 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7799 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007800}
7801
7802PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007803 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007804\n\
7805");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007806
7807static PyObject *
7808unicode_getnewargs(PyUnicodeObject *v)
7809{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007810 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007811}
7812
7813
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007815 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007816 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7817 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007818 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007819 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7820 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7821 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7822 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7823 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7824 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7825 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007826 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007827 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7828 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7829 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007830 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007831 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7833 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7834 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7835 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007836 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007837 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007838 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007840 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7841 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7842 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7843 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7844 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7845 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7846 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7847 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7848 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7849 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7850 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7851 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7852 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7853 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007855 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7856 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7857 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7858 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007859 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007860#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007861 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862#endif
7863
7864#if 0
7865 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007866 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867#endif
7868
Benjamin Peterson857ce152009-01-31 16:29:18 +00007869 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 {NULL, NULL}
7871};
7872
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007873static PyObject *
7874unicode_mod(PyObject *v, PyObject *w)
7875{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007876 if (!PyUnicode_Check(v)) {
7877 Py_INCREF(Py_NotImplemented);
7878 return Py_NotImplemented;
7879 }
7880 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007881}
7882
7883static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007884 0, /*nb_add*/
7885 0, /*nb_subtract*/
7886 0, /*nb_multiply*/
7887 0, /*nb_divide*/
7888 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007889};
7890
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007892 (lenfunc) unicode_length, /* sq_length */
7893 PyUnicode_Concat, /* sq_concat */
7894 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7895 (ssizeargfunc) unicode_getitem, /* sq_item */
7896 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7897 0, /* sq_ass_item */
7898 0, /* sq_ass_slice */
7899 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900};
7901
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007902static PyObject*
7903unicode_subscript(PyUnicodeObject* self, PyObject* item)
7904{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007905 if (PyIndex_Check(item)) {
7906 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007907 if (i == -1 && PyErr_Occurred())
7908 return NULL;
7909 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007910 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007911 return unicode_getitem(self, i);
7912 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007914 Py_UNICODE* source_buf;
7915 Py_UNICODE* result_buf;
7916 PyObject* result;
7917
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007918 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007919 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920 return NULL;
7921 }
7922
7923 if (slicelength <= 0) {
7924 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007925 } else if (start == 0 && step == 1 && slicelength == self->length &&
7926 PyUnicode_CheckExact(self)) {
7927 Py_INCREF(self);
7928 return (PyObject *)self;
7929 } else if (step == 1) {
7930 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007931 } else {
7932 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007933 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7934 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007935
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007936 if (result_buf == NULL)
7937 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007938
7939 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7940 result_buf[i] = source_buf[cur];
7941 }
Tim Petersced69f82003-09-16 20:30:58 +00007942
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007943 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007944 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007945 return result;
7946 }
7947 } else {
7948 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7949 return NULL;
7950 }
7951}
7952
7953static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007954 (lenfunc)unicode_length, /* mp_length */
7955 (binaryfunc)unicode_subscript, /* mp_subscript */
7956 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007957};
7958
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 Py_ssize_t index,
7962 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963{
7964 if (index != 0) {
7965 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007966 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 return -1;
7968 }
7969 *ptr = (void *) self->str;
7970 return PyUnicode_GET_DATA_SIZE(self);
7971}
7972
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973static Py_ssize_t
7974unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976{
7977 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007978 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 return -1;
7980}
7981
7982static int
7983unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007984 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985{
7986 if (lenp)
7987 *lenp = PyUnicode_GET_DATA_SIZE(self);
7988 return 1;
7989}
7990
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007991static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007993 Py_ssize_t index,
7994 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
7996 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 if (index != 0) {
7999 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008000 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 return -1;
8002 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008003 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008005 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008006 *ptr = (void *) PyString_AS_STRING(str);
8007 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008}
8009
8010/* Helpers for PyUnicode_Format() */
8011
8012static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 (*p_argidx)++;
8018 if (arglen < 0)
8019 return args;
8020 else
8021 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
8023 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008024 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 return NULL;
8026}
8027
/* Bit flags recording which format-specifier flag characters were seen. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008035strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008037 register Py_ssize_t i;
8038 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008040 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 return len;
8043}
8044
Neal Norwitzfc76d632006-01-10 06:03:13 +00008045static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008046longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8047{
Tim Peters15231542006-02-16 01:08:01 +00008048 Py_ssize_t result;
8049
Neal Norwitzfc76d632006-01-10 06:03:13 +00008050 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008051 result = strtounicode(buffer, (char *)buffer);
8052 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008053}
8054
Guido van Rossum078151d2002-08-11 04:24:12 +00008055/* XXX To save some code duplication, formatfloat/long/int could have been
8056 shared with stringobject.c, converting from 8-bit to Unicode after the
8057 formatting is done. */
8058
Mark Dickinson18cfada2009-11-23 18:46:41 +00008059/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8060
8061static PyObject *
8062formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008064 char *p;
8065 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008067
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 x = PyFloat_AsDouble(v);
8069 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008070 return NULL;
8071
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008073 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008074
Mark Dickinson18cfada2009-11-23 18:46:41 +00008075 p = PyOS_double_to_string(x, type, prec,
8076 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8077 if (p == NULL)
8078 return NULL;
8079 result = PyUnicode_FromStringAndSize(p, strlen(p));
8080 PyMem_Free(p);
8081 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082}
8083
Tim Peters38fd5b62000-09-21 05:43:11 +00008084static PyObject*
8085formatlong(PyObject *val, int flags, int prec, int type)
8086{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008087 char *buf;
8088 int i, len;
8089 PyObject *str; /* temporary string object. */
8090 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008091
Benjamin Peterson857ce152009-01-31 16:29:18 +00008092 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8093 if (!str)
8094 return NULL;
8095 result = _PyUnicode_New(len);
8096 if (!result) {
8097 Py_DECREF(str);
8098 return NULL;
8099 }
8100 for (i = 0; i < len; i++)
8101 result->str[i] = buf[i];
8102 result->str[len] = 0;
8103 Py_DECREF(str);
8104 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008105}
8106
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107static int
8108formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008109 size_t buflen,
8110 int flags,
8111 int prec,
8112 int type,
8113 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008115 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008116 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8117 * + 1 + 1
8118 * = 24
8119 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008120 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008121 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 long x;
8123
8124 x = PyInt_AsLong(v);
8125 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008126 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008127 if (x < 0 && type == 'u') {
8128 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008129 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008130 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8131 sign = "-";
8132 else
8133 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008135 prec = 1;
8136
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008137 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8138 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008139 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008140 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008141 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008142 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008143 return -1;
8144 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008145
8146 if ((flags & F_ALT) &&
8147 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008148 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008149 * of issues that cause pain:
8150 * - when 0 is being converted, the C standard leaves off
8151 * the '0x' or '0X', which is inconsistent with other
8152 * %#x/%#X conversions and inconsistent with Python's
8153 * hex() function
8154 * - there are platforms that violate the standard and
8155 * convert 0 with the '0x' or '0X'
8156 * (Metrowerks, Compaq Tru64)
8157 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008158 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008159 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008160 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008161 * We can achieve the desired consistency by inserting our
8162 * own '0x' or '0X' prefix, and substituting %x/%X in place
8163 * of %#x/%#X.
8164 *
8165 * Note that this is the same approach as used in
8166 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008167 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008168 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8169 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008170 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008171 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008172 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8173 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008174 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008175 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008176 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008177 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008178 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008179 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180}
8181
8182static int
8183formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008184 size_t buflen,
8185 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186{
Ezio Melotti32125152010-02-25 17:36:04 +00008187 PyObject *unistr;
8188 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008189 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008190 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008191 if (PyUnicode_GET_SIZE(v) != 1)
8192 goto onError;
8193 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008196 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008197 if (PyString_GET_SIZE(v) != 1)
8198 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008199 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8200 with a UnicodeDecodeError if 'char' is not decodable with the
8201 default encoding (usually ASCII, but it might be something else) */
8202 str = PyString_AS_STRING(v);
8203 if ((unsigned char)str[0] > 0x7F) {
8204 /* the char is not ASCII; try to decode the string using the
8205 default encoding and return -1 to let the UnicodeDecodeError
8206 be raised if the string can't be decoded */
8207 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8208 if (unistr == NULL)
8209 return -1;
8210 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8211 Py_DECREF(unistr);
8212 }
8213 else
8214 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216
8217 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008218 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 x = PyInt_AsLong(v);
8221 if (x == -1 && PyErr_Occurred())
8222 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008223#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008224 if (x < 0 || x > 0x10ffff) {
8225 PyErr_SetString(PyExc_OverflowError,
8226 "%c arg not in range(0x110000) "
8227 "(wide Python build)");
8228 return -1;
8229 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008230#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008231 if (x < 0 || x > 0xffff) {
8232 PyErr_SetString(PyExc_OverflowError,
8233 "%c arg not in range(0x10000) "
8234 "(narrow Python build)");
8235 return -1;
8236 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008237#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008238 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 }
8240 buf[1] = '\0';
8241 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008243 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008244 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008245 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008246 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247}
8248
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The replacement list is fully parenthesized so that the macro behaves
   as a single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8258
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008260 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261{
8262 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008263 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 int args_owned = 0;
8265 PyUnicodeObject *result = NULL;
8266 PyObject *dict = NULL;
8267 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008270 PyErr_BadInternalCall();
8271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
8273 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008274 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 fmt = PyUnicode_AS_UNICODE(uformat);
8277 fmtcnt = PyUnicode_GET_SIZE(uformat);
8278
8279 reslen = rescnt = fmtcnt + 100;
8280 result = _PyUnicode_New(reslen);
8281 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 res = PyUnicode_AS_UNICODE(result);
8284
8285 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008286 arglen = PyTuple_Size(args);
8287 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
8289 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008290 arglen = -1;
8291 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008293 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008294 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008295 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
8297 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008298 if (*fmt != '%') {
8299 if (--rescnt < 0) {
8300 rescnt = fmtcnt + 100;
8301 reslen += rescnt;
8302 if (_PyUnicode_Resize(&result, reslen) < 0)
8303 goto onError;
8304 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8305 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008306 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008308 }
8309 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008310 /* Got a format specifier */
8311 int flags = 0;
8312 Py_ssize_t width = -1;
8313 int prec = -1;
8314 Py_UNICODE c = '\0';
8315 Py_UNICODE fill;
8316 int isnumok;
8317 PyObject *v = NULL;
8318 PyObject *temp = NULL;
8319 Py_UNICODE *pbuf;
8320 Py_UNICODE sign;
8321 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008322 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008323
8324 fmt++;
8325 if (*fmt == '(') {
8326 Py_UNICODE *keystart;
8327 Py_ssize_t keylen;
8328 PyObject *key;
8329 int pcount = 1;
8330
8331 if (dict == NULL) {
8332 PyErr_SetString(PyExc_TypeError,
8333 "format requires a mapping");
8334 goto onError;
8335 }
8336 ++fmt;
8337 --fmtcnt;
8338 keystart = fmt;
8339 /* Skip over balanced parentheses */
8340 while (pcount > 0 && --fmtcnt >= 0) {
8341 if (*fmt == ')')
8342 --pcount;
8343 else if (*fmt == '(')
8344 ++pcount;
8345 fmt++;
8346 }
8347 keylen = fmt - keystart - 1;
8348 if (fmtcnt < 0 || pcount > 0) {
8349 PyErr_SetString(PyExc_ValueError,
8350 "incomplete format key");
8351 goto onError;
8352 }
8353#if 0
8354 /* keys are converted to strings using UTF-8 and
8355 then looked up since Python uses strings to hold
8356 variables names etc. in its namespaces and we
8357 wouldn't want to break common idioms. */
8358 key = PyUnicode_EncodeUTF8(keystart,
8359 keylen,
8360 NULL);
8361#else
8362 key = PyUnicode_FromUnicode(keystart, keylen);
8363#endif
8364 if (key == NULL)
8365 goto onError;
8366 if (args_owned) {
8367 Py_DECREF(args);
8368 args_owned = 0;
8369 }
8370 args = PyObject_GetItem(dict, key);
8371 Py_DECREF(key);
8372 if (args == NULL) {
8373 goto onError;
8374 }
8375 args_owned = 1;
8376 arglen = -1;
8377 argidx = -2;
8378 }
8379 while (--fmtcnt >= 0) {
8380 switch (c = *fmt++) {
8381 case '-': flags |= F_LJUST; continue;
8382 case '+': flags |= F_SIGN; continue;
8383 case ' ': flags |= F_BLANK; continue;
8384 case '#': flags |= F_ALT; continue;
8385 case '0': flags |= F_ZERO; continue;
8386 }
8387 break;
8388 }
8389 if (c == '*') {
8390 v = getnextarg(args, arglen, &argidx);
8391 if (v == NULL)
8392 goto onError;
8393 if (!PyInt_Check(v)) {
8394 PyErr_SetString(PyExc_TypeError,
8395 "* wants int");
8396 goto onError;
8397 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008398 width = PyInt_AsSsize_t(v);
8399 if (width == -1 && PyErr_Occurred())
8400 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008401 if (width < 0) {
8402 flags |= F_LJUST;
8403 width = -width;
8404 }
8405 if (--fmtcnt >= 0)
8406 c = *fmt++;
8407 }
8408 else if (c >= '0' && c <= '9') {
8409 width = c - '0';
8410 while (--fmtcnt >= 0) {
8411 c = *fmt++;
8412 if (c < '0' || c > '9')
8413 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008414 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008415 PyErr_SetString(PyExc_ValueError,
8416 "width too big");
8417 goto onError;
8418 }
8419 width = width*10 + (c - '0');
8420 }
8421 }
8422 if (c == '.') {
8423 prec = 0;
8424 if (--fmtcnt >= 0)
8425 c = *fmt++;
8426 if (c == '*') {
8427 v = getnextarg(args, arglen, &argidx);
8428 if (v == NULL)
8429 goto onError;
8430 if (!PyInt_Check(v)) {
8431 PyErr_SetString(PyExc_TypeError,
8432 "* wants int");
8433 goto onError;
8434 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008435 prec = _PyInt_AsInt(v);
8436 if (prec == -1 && PyErr_Occurred())
8437 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008438 if (prec < 0)
8439 prec = 0;
8440 if (--fmtcnt >= 0)
8441 c = *fmt++;
8442 }
8443 else if (c >= '0' && c <= '9') {
8444 prec = c - '0';
8445 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008446 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008447 if (c < '0' || c > '9')
8448 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008449 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008450 PyErr_SetString(PyExc_ValueError,
8451 "prec too big");
8452 goto onError;
8453 }
8454 prec = prec*10 + (c - '0');
8455 }
8456 }
8457 } /* prec */
8458 if (fmtcnt >= 0) {
8459 if (c == 'h' || c == 'l' || c == 'L') {
8460 if (--fmtcnt >= 0)
8461 c = *fmt++;
8462 }
8463 }
8464 if (fmtcnt < 0) {
8465 PyErr_SetString(PyExc_ValueError,
8466 "incomplete format");
8467 goto onError;
8468 }
8469 if (c != '%') {
8470 v = getnextarg(args, arglen, &argidx);
8471 if (v == NULL)
8472 goto onError;
8473 }
8474 sign = 0;
8475 fill = ' ';
8476 switch (c) {
8477
8478 case '%':
8479 pbuf = formatbuf;
8480 /* presume that buffer length is at least 1 */
8481 pbuf[0] = '%';
8482 len = 1;
8483 break;
8484
8485 case 's':
8486 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008487 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008488 temp = v;
8489 Py_INCREF(temp);
8490 }
8491 else {
8492 PyObject *unicode;
8493 if (c == 's')
8494 temp = PyObject_Unicode(v);
8495 else
8496 temp = PyObject_Repr(v);
8497 if (temp == NULL)
8498 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008499 if (PyUnicode_Check(temp))
8500 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008501 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008502 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008503 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8504 PyString_GET_SIZE(temp),
8505 NULL,
8506 "strict");
8507 Py_DECREF(temp);
8508 temp = unicode;
8509 if (temp == NULL)
8510 goto onError;
8511 }
8512 else {
8513 Py_DECREF(temp);
8514 PyErr_SetString(PyExc_TypeError,
8515 "%s argument has non-string str()");
8516 goto onError;
8517 }
8518 }
8519 pbuf = PyUnicode_AS_UNICODE(temp);
8520 len = PyUnicode_GET_SIZE(temp);
8521 if (prec >= 0 && len > prec)
8522 len = prec;
8523 break;
8524
8525 case 'i':
8526 case 'd':
8527 case 'u':
8528 case 'o':
8529 case 'x':
8530 case 'X':
8531 if (c == 'i')
8532 c = 'd';
8533 isnumok = 0;
8534 if (PyNumber_Check(v)) {
8535 PyObject *iobj=NULL;
8536
8537 if (PyInt_Check(v) || (PyLong_Check(v))) {
8538 iobj = v;
8539 Py_INCREF(iobj);
8540 }
8541 else {
8542 iobj = PyNumber_Int(v);
8543 if (iobj==NULL) iobj = PyNumber_Long(v);
8544 }
8545 if (iobj!=NULL) {
8546 if (PyInt_Check(iobj)) {
8547 isnumok = 1;
8548 pbuf = formatbuf;
8549 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8550 flags, prec, c, iobj);
8551 Py_DECREF(iobj);
8552 if (len < 0)
8553 goto onError;
8554 sign = 1;
8555 }
8556 else if (PyLong_Check(iobj)) {
8557 isnumok = 1;
8558 temp = formatlong(iobj, flags, prec, c);
8559 Py_DECREF(iobj);
8560 if (!temp)
8561 goto onError;
8562 pbuf = PyUnicode_AS_UNICODE(temp);
8563 len = PyUnicode_GET_SIZE(temp);
8564 sign = 1;
8565 }
8566 else {
8567 Py_DECREF(iobj);
8568 }
8569 }
8570 }
8571 if (!isnumok) {
8572 PyErr_Format(PyExc_TypeError,
8573 "%%%c format: a number is required, "
8574 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8575 goto onError;
8576 }
8577 if (flags & F_ZERO)
8578 fill = '0';
8579 break;
8580
8581 case 'e':
8582 case 'E':
8583 case 'f':
8584 case 'F':
8585 case 'g':
8586 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008587 temp = formatfloat(v, flags, prec, c);
8588 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008589 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008590 pbuf = PyUnicode_AS_UNICODE(temp);
8591 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008592 sign = 1;
8593 if (flags & F_ZERO)
8594 fill = '0';
8595 break;
8596
8597 case 'c':
8598 pbuf = formatbuf;
8599 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8600 if (len < 0)
8601 goto onError;
8602 break;
8603
8604 default:
8605 PyErr_Format(PyExc_ValueError,
8606 "unsupported format character '%c' (0x%x) "
8607 "at index %zd",
8608 (31<=c && c<=126) ? (char)c : '?',
8609 (int)c,
8610 (Py_ssize_t)(fmt - 1 -
8611 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008612 goto onError;
8613 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008614 if (sign) {
8615 if (*pbuf == '-' || *pbuf == '+') {
8616 sign = *pbuf++;
8617 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008618 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008619 else if (flags & F_SIGN)
8620 sign = '+';
8621 else if (flags & F_BLANK)
8622 sign = ' ';
8623 else
8624 sign = 0;
8625 }
8626 if (width < len)
8627 width = len;
8628 if (rescnt - (sign != 0) < width) {
8629 reslen -= rescnt;
8630 rescnt = width + fmtcnt + 100;
8631 reslen += rescnt;
8632 if (reslen < 0) {
8633 Py_XDECREF(temp);
8634 PyErr_NoMemory();
8635 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008636 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008637 if (_PyUnicode_Resize(&result, reslen) < 0) {
8638 Py_XDECREF(temp);
8639 goto onError;
8640 }
8641 res = PyUnicode_AS_UNICODE(result)
8642 + reslen - rescnt;
8643 }
8644 if (sign) {
8645 if (fill != ' ')
8646 *res++ = sign;
8647 rescnt--;
8648 if (width > len)
8649 width--;
8650 }
8651 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8652 assert(pbuf[0] == '0');
8653 assert(pbuf[1] == c);
8654 if (fill != ' ') {
8655 *res++ = *pbuf++;
8656 *res++ = *pbuf++;
8657 }
8658 rescnt -= 2;
8659 width -= 2;
8660 if (width < 0)
8661 width = 0;
8662 len -= 2;
8663 }
8664 if (width > len && !(flags & F_LJUST)) {
8665 do {
8666 --rescnt;
8667 *res++ = fill;
8668 } while (--width > len);
8669 }
8670 if (fill == ' ') {
8671 if (sign)
8672 *res++ = sign;
8673 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8674 assert(pbuf[0] == '0');
8675 assert(pbuf[1] == c);
8676 *res++ = *pbuf++;
8677 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008678 }
8679 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008680 Py_UNICODE_COPY(res, pbuf, len);
8681 res += len;
8682 rescnt -= len;
8683 while (--width >= len) {
8684 --rescnt;
8685 *res++ = ' ';
8686 }
8687 if (dict && (argidx < arglen) && c != '%') {
8688 PyErr_SetString(PyExc_TypeError,
8689 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008690 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008691 goto onError;
8692 }
8693 Py_XDECREF(temp);
8694 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 } /* until end */
8696 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008697 PyErr_SetString(PyExc_TypeError,
8698 "not all arguments converted during string formatting");
8699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 }
8701
Thomas Woutersa96affe2006-03-12 00:29:36 +00008702 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008703 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008705 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 }
8707 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 return (PyObject *)result;
8709
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008710 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 Py_XDECREF(result);
8712 Py_DECREF(uformat);
8713 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008714 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 }
8716 return NULL;
8717}
8718
8719static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008720 (readbufferproc) unicode_buffer_getreadbuf,
8721 (writebufferproc) unicode_buffer_getwritebuf,
8722 (segcountproc) unicode_buffer_getsegcount,
8723 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724};
8725
Jeremy Hylton938ace62002-07-17 16:30:39 +00008726static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008727unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8728
Tim Peters6d6c1a32001-08-02 04:15:00 +00008729static PyObject *
8730unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8731{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008732 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008733 static char *kwlist[] = {"string", "encoding", "errors", 0};
8734 char *encoding = NULL;
8735 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008736
Benjamin Peterson857ce152009-01-31 16:29:18 +00008737 if (type != &PyUnicode_Type)
8738 return unicode_subtype_new(type, args, kwds);
8739 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008740 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008741 return NULL;
8742 if (x == NULL)
8743 return (PyObject *)_PyUnicode_New(0);
8744 if (encoding == NULL && errors == NULL)
8745 return PyObject_Unicode(x);
8746 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008747 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008748}
8749
Guido van Rossume023fe02001-08-30 03:12:59 +00008750static PyObject *
8751unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8752{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008753 PyUnicodeObject *tmp, *pnew;
8754 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008755
Benjamin Peterson857ce152009-01-31 16:29:18 +00008756 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8757 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8758 if (tmp == NULL)
8759 return NULL;
8760 assert(PyUnicode_Check(tmp));
8761 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8762 if (pnew == NULL) {
8763 Py_DECREF(tmp);
8764 return NULL;
8765 }
8766 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8767 if (pnew->str == NULL) {
8768 _Py_ForgetReference((PyObject *)pnew);
8769 PyObject_Del(pnew);
8770 Py_DECREF(tmp);
8771 return PyErr_NoMemory();
8772 }
8773 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8774 pnew->length = n;
8775 pnew->hash = tmp->hash;
8776 Py_DECREF(tmp);
8777 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008778}
8779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008780PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008781 "unicode(object='') -> unicode object\n\
8782unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008783\n\
8784Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008785encoding defaults to the current default string encoding.\n\
8786errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008787
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008789 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008790 "unicode", /* tp_name */
8791 sizeof(PyUnicodeObject), /* tp_size */
8792 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008794 (destructor)unicode_dealloc, /* tp_dealloc */
8795 0, /* tp_print */
8796 0, /* tp_getattr */
8797 0, /* tp_setattr */
8798 0, /* tp_compare */
8799 unicode_repr, /* tp_repr */
8800 &unicode_as_number, /* tp_as_number */
8801 &unicode_as_sequence, /* tp_as_sequence */
8802 &unicode_as_mapping, /* tp_as_mapping */
8803 (hashfunc) unicode_hash, /* tp_hash*/
8804 0, /* tp_call*/
8805 (reprfunc) unicode_str, /* tp_str */
8806 PyObject_GenericGetAttr, /* tp_getattro */
8807 0, /* tp_setattro */
8808 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008809 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008810 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008811 unicode_doc, /* tp_doc */
8812 0, /* tp_traverse */
8813 0, /* tp_clear */
8814 PyUnicode_RichCompare, /* tp_richcompare */
8815 0, /* tp_weaklistoffset */
8816 0, /* tp_iter */
8817 0, /* tp_iternext */
8818 unicode_methods, /* tp_methods */
8819 0, /* tp_members */
8820 0, /* tp_getset */
8821 &PyBaseString_Type, /* tp_base */
8822 0, /* tp_dict */
8823 0, /* tp_descr_get */
8824 0, /* tp_descr_set */
8825 0, /* tp_dictoffset */
8826 0, /* tp_init */
8827 0, /* tp_alloc */
8828 unicode_new, /* tp_new */
8829 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830};
8831
8832/* Initialize the Unicode implementation */
8833
Thomas Wouters78890102000-07-22 19:25:51 +00008834void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008836 /* XXX - move this array to unicodectype.c ? */
8837 Py_UNICODE linebreak[] = {
8838 0x000A, /* LINE FEED */
8839 0x000D, /* CARRIAGE RETURN */
8840 0x001C, /* FILE SEPARATOR */
8841 0x001D, /* GROUP SEPARATOR */
8842 0x001E, /* RECORD SEPARATOR */
8843 0x0085, /* NEXT LINE */
8844 0x2028, /* LINE SEPARATOR */
8845 0x2029, /* PARAGRAPH SEPARATOR */
8846 };
8847
Fred Drakee4315f52000-05-09 19:53:39 +00008848 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008849 if (!unicode_empty) {
8850 unicode_empty = _PyUnicode_New(0);
8851 if (!unicode_empty)
8852 return;
8853 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008854
Guido van Rossumcacfc072002-05-24 19:01:59 +00008855 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008856 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008857
8858 /* initialize the linebreak bloom filter */
8859 bloom_linebreak = make_bloom_mask(
8860 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8861 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008862
8863 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008864
8865 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8866 Py_FatalError("Can't initialize field name iterator type");
8867
8868 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8869 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870}
8871
8872/* Finalize the Unicode implementation */
8873
Christian Heimes3b718a72008-02-14 12:47:33 +00008874int
8875PyUnicode_ClearFreeList(void)
8876{
8877 int freelist_size = numfree;
8878 PyUnicodeObject *u;
8879
8880 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008881 PyUnicodeObject *v = u;
8882 u = *(PyUnicodeObject **)u;
8883 if (v->str)
8884 PyObject_DEL(v->str);
8885 Py_XDECREF(v->defenc);
8886 PyObject_Del(v);
8887 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008888 }
8889 free_list = NULL;
8890 assert(numfree == 0);
8891 return freelist_size;
8892}
8893
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894void
Thomas Wouters78890102000-07-22 19:25:51 +00008895_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008897 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008899 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008900
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008901 for (i = 0; i < 256; i++)
8902 Py_CLEAR(unicode_latin1[i]);
8903
Christian Heimes3b718a72008-02-14 12:47:33 +00008904 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008906
Anthony Baxterac6bd462006-04-13 02:06:09 +00008907#ifdef __cplusplus
8908}
8909#endif