blob: 866eb9b0589e2bf0347e1689dd53e896849727a2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Anthony Baxterac6bd462006-04-13 02:06:09 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Guido van Rossumd57fd912000-03-10 22:53:23 +000096/* Free list for Unicode objects */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +020097static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000100/* The empty Unicode object is shared to improve performance. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200117static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000123 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000124
125*/
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200126static char unicode_default_encoding[100 + 1] = "ascii";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by ASCII code (0..127); an entry is 1 when the
   character counts as whitespace:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks.
   Lookup table indexed by ASCII code (0..127); an entry is 1 when the
   character is a line break:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrou10042922010-01-13 14:01:26 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218#define BLOOM_MASK unsigned long
219
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221
Antoine Pitrou10042922010-01-13 14:01:26 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrou10042922010-01-13 14:01:26 +0000233 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000238 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239
240 return mask;
241}
242
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
Fredrik Lundh77633512006-05-23 19:47:35 +0000251 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000252}
253
/* Non-zero when chr is a member of set[0..setlen-1].  The bloom mask is
   consulted first as a cheap pre-filter; only on a possible hit is the
   exact (linear) unicode_member() search performed.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without precedence surprises.
   Note: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272
Benjamin Peterson857ce152009-01-31 16:29:18 +0000273 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000284 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Andrew Dalkee0df7622006-05-27 11:04:36 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitze7d8be82008-07-31 17:17:14 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000376 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000383 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387static
Guido van Rossum9475a232001-10-05 20:51:39 +0000388void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000390 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000391 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000392 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000399 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000400 }
401 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 }
411}
412
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415{
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 PyErr_BadInternalCall();
421 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000423 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000425 PyErr_BadInternalCall();
426 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000432 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447}
448
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456{
457 PyUnicodeObject *unicode;
458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000463 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 }
Tim Petersced69f82003-09-16 20:30:58 +0000482
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490
491 return (PyObject *)unicode;
492}
493
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000497
Benjamin Peterson857ce152009-01-31 16:29:18 +0000498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000500 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000501 return NULL;
502 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000503
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000510 /* Optimization for empty strings */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +0200511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548}
549
Serhiy Storchakae822b032013-08-06 16:56:26 +0300550/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551 * by 'ptr', possibly combining surrogate pairs on narrow builds.
552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553 * that should be returned and 'end' pointing to the end of the buffer.
554 * ('end' is used on narrow builds to detect a lone surrogate at the
555 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557 * The type of the returned char is always Py_UCS4.
558 *
559 * Note: the macro advances ptr to next char, so it might have side-effects
560 * (especially if used with other macros).
561 */
562
/* helper macros used by _Py_UNICODE_NEXT; ch is evaluated twice, so it
   must be free of side effects. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* _Py_UNICODE_NEXT(ptr, end): yield the character at ptr as a Py_UCS4
   and advance ptr past it.
   Wide builds: one code unit is one character.
   Narrow builds: a high surrogate immediately followed by a low
   surrogate is combined into one character and ptr advances by two.
   The second unit is only looked at when it lies inside the buffer
   ((ptr) + 1 < (end)), so a lone high surrogate in the last position is
   returned unchanged and nothing at or past end is read. */
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((((ptr) + 1) < (end)) &&                                         \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson6b265f12009-03-18 16:07:26 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000654#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662#endif
663
664 return (PyObject *)unicode;
665}
666
Mark Dickinson6b265f12009-03-18 16:07:26 +0000667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000691}
692
693#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
694
695PyObject *
696PyUnicode_FromFormatV(const char *format, va_list vargs)
697{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000698 va_list count;
699 Py_ssize_t callcount = 0;
700 PyObject **callresults = NULL;
701 PyObject **callresult = NULL;
702 Py_ssize_t n = 0;
703 int width = 0;
704 int precision = 0;
705 int zeropad;
706 const char* f;
707 Py_UNICODE *s;
708 PyObject *string;
709 /* used by sprintf */
710 char buffer[21];
711 /* use abuffer instead of buffer, if we need more space
712 * (which can happen if there's a format specifier with width). */
713 char *abuffer = NULL;
714 char *realbuffer;
715 Py_ssize_t abuffersize = 0;
716 char fmt[60]; /* should be enough for %0width.precisionld */
717 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000718
719#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000720 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000721#else
722#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000723 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726#endif
727#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000728 /* step 1: count the number of %S/%R/%s format specifications
729 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
730 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000731 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000732 if (*f == '%') {
733 if (*(f+1)=='%')
734 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000735 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000736 ++callcount;
737 while (isdigit((unsigned)*f))
738 width = (width*10) + *f++ - '0';
739 while (*++f && *f != '%' && !isalpha((unsigned)*f))
740 ;
741 if (*f == 's')
742 ++callcount;
743 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000744 }
745 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000746 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000747 if (callcount) {
748 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
749 if (!callresults) {
750 PyErr_NoMemory();
751 return NULL;
752 }
753 callresult = callresults;
754 }
755 /* step 3: figure out how large a buffer we need */
756 for (f = format; *f; f++) {
757 if (*f == '%') {
758 const char* p = f;
759 width = 0;
760 while (isdigit((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !isalpha((unsigned)*f))
763 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000764
Benjamin Peterson857ce152009-01-31 16:29:18 +0000765 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
766 * they don't affect the amount of space we reserve.
767 */
768 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000769 (f[1] == 'd' || f[1] == 'u'))
770 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000771
Benjamin Peterson857ce152009-01-31 16:29:18 +0000772 switch (*f) {
773 case 'c':
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300774 {
775 int ordinal = va_arg(count, int);
776#ifdef Py_UNICODE_WIDE
777 if (ordinal < 0 || ordinal > 0x10ffff) {
778 PyErr_SetString(PyExc_OverflowError,
779 "%c arg not in range(0x110000) "
780 "(wide Python build)");
781 goto fail;
782 }
783#else
784 if (ordinal < 0 || ordinal > 0xffff) {
785 PyErr_SetString(PyExc_OverflowError,
786 "%c arg not in range(0x10000) "
787 "(narrow Python build)");
788 goto fail;
789 }
790#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000791 /* fall through... */
Serhiy Storchakaba908c72013-06-23 20:22:09 +0300792 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000793 case '%':
794 n++;
795 break;
796 case 'd': case 'u': case 'i': case 'x':
797 (void) va_arg(count, int);
798 /* 20 bytes is enough to hold a 64-bit
799 integer. Decimal takes the most space.
800 This isn't enough for octal.
801 If a width is specified we need more
802 (which we allocate later). */
803 if (width < 20)
804 width = 20;
805 n += width;
806 if (abuffersize < width)
807 abuffersize = width;
808 break;
809 case 's':
810 {
811 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000812 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000813 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
814 if (!str)
815 goto fail;
816 n += PyUnicode_GET_SIZE(str);
817 /* Remember the str and switch to the next slot */
818 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000819 break;
820 }
821 case 'U':
822 {
823 PyObject *obj = va_arg(count, PyObject *);
824 assert(obj && PyUnicode_Check(obj));
825 n += PyUnicode_GET_SIZE(obj);
826 break;
827 }
828 case 'V':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 const char *str = va_arg(count, const char *);
832 assert(obj || str);
833 assert(!obj || PyUnicode_Check(obj));
834 if (obj)
835 n += PyUnicode_GET_SIZE(obj);
836 else
837 n += strlen(str);
838 break;
839 }
840 case 'S':
841 {
842 PyObject *obj = va_arg(count, PyObject *);
843 PyObject *str;
844 assert(obj);
845 str = PyObject_Str(obj);
846 if (!str)
847 goto fail;
848 n += PyUnicode_GET_SIZE(str);
849 /* Remember the str and switch to the next slot */
850 *callresult++ = str;
851 break;
852 }
853 case 'R':
854 {
855 PyObject *obj = va_arg(count, PyObject *);
856 PyObject *repr;
857 assert(obj);
858 repr = PyObject_Repr(obj);
859 if (!repr)
860 goto fail;
861 n += PyUnicode_GET_SIZE(repr);
862 /* Remember the repr and switch to the next slot */
863 *callresult++ = repr;
864 break;
865 }
866 case 'p':
867 (void) va_arg(count, int);
868 /* maximum 64-bit pointer representation:
869 * 0xffffffffffffffff
870 * so 19 characters is enough.
871 * XXX I count 18 -- what's the extra for?
872 */
873 n += 19;
874 break;
875 default:
876 /* if we stumble upon an unknown
877 formatting code, copy the rest of
878 the format string to the output
879 string. (we cannot just skip the
880 code, since there's no way to know
881 what's in the argument list) */
882 n += strlen(p);
883 goto expand;
884 }
885 } else
886 n++;
887 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000888 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000889 if (abuffersize > 20) {
890 abuffer = PyObject_Malloc(abuffersize);
891 if (!abuffer) {
892 PyErr_NoMemory();
893 goto fail;
894 }
895 realbuffer = abuffer;
896 }
897 else
898 realbuffer = buffer;
899 /* step 4: fill the buffer */
900 /* Since we've analyzed how much space we need for the worst case,
901 we don't have to resize the string.
902 There can be no errors beyond this point. */
903 string = PyUnicode_FromUnicode(NULL, n);
904 if (!string)
905 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000906
Benjamin Peterson857ce152009-01-31 16:29:18 +0000907 s = PyUnicode_AS_UNICODE(string);
908 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000909
Benjamin Peterson857ce152009-01-31 16:29:18 +0000910 for (f = format; *f; f++) {
911 if (*f == '%') {
912 const char* p = f++;
913 int longflag = 0;
914 int size_tflag = 0;
915 zeropad = (*f == '0');
916 /* parse the width.precision part */
917 width = 0;
918 while (isdigit((unsigned)*f))
919 width = (width*10) + *f++ - '0';
920 precision = 0;
921 if (*f == '.') {
922 f++;
923 while (isdigit((unsigned)*f))
924 precision = (precision*10) + *f++ - '0';
925 }
926 /* handle the long flag, but only for %ld and %lu.
927 others can be added when necessary. */
928 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
929 longflag = 1;
930 ++f;
931 }
932 /* handle the size_t flag. */
933 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
934 size_tflag = 1;
935 ++f;
936 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000937
Benjamin Peterson857ce152009-01-31 16:29:18 +0000938 switch (*f) {
939 case 'c':
940 *s++ = va_arg(vargs, int);
941 break;
942 case 'd':
943 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
944 if (longflag)
945 sprintf(realbuffer, fmt, va_arg(vargs, long));
946 else if (size_tflag)
947 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
948 else
949 sprintf(realbuffer, fmt, va_arg(vargs, int));
950 appendstring(realbuffer);
951 break;
952 case 'u':
953 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
954 if (longflag)
955 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
956 else if (size_tflag)
957 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
958 else
959 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
960 appendstring(realbuffer);
961 break;
962 case 'i':
963 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
964 sprintf(realbuffer, fmt, va_arg(vargs, int));
965 appendstring(realbuffer);
966 break;
967 case 'x':
968 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
969 sprintf(realbuffer, fmt, va_arg(vargs, int));
970 appendstring(realbuffer);
971 break;
972 case 's':
973 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000974 /* unused, since we already have the result */
975 (void) va_arg(vargs, char *);
976 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
977 PyUnicode_GET_SIZE(*callresult));
978 s += PyUnicode_GET_SIZE(*callresult);
979 /* We're done with the unicode()/repr() => forget it */
980 Py_DECREF(*callresult);
981 /* switch to next unicode()/repr() result */
982 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000983 break;
984 }
985 case 'U':
986 {
987 PyObject *obj = va_arg(vargs, PyObject *);
988 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
989 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
990 s += size;
991 break;
992 }
993 case 'V':
994 {
995 PyObject *obj = va_arg(vargs, PyObject *);
996 const char *str = va_arg(vargs, const char *);
997 if (obj) {
998 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
999 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1000 s += size;
1001 } else {
1002 appendstring(str);
1003 }
1004 break;
1005 }
1006 case 'S':
1007 case 'R':
1008 {
1009 Py_UNICODE *ucopy;
1010 Py_ssize_t usize;
1011 Py_ssize_t upos;
1012 /* unused, since we already have the result */
1013 (void) va_arg(vargs, PyObject *);
1014 ucopy = PyUnicode_AS_UNICODE(*callresult);
1015 usize = PyUnicode_GET_SIZE(*callresult);
1016 for (upos = 0; upos<usize;)
1017 *s++ = ucopy[upos++];
1018 /* We're done with the unicode()/repr() => forget it */
1019 Py_DECREF(*callresult);
1020 /* switch to next unicode()/repr() result */
1021 ++callresult;
1022 break;
1023 }
1024 case 'p':
1025 sprintf(buffer, "%p", va_arg(vargs, void*));
1026 /* %p is ill-defined: ensure leading 0x. */
1027 if (buffer[1] == 'X')
1028 buffer[1] = 'x';
1029 else if (buffer[1] != 'x') {
1030 memmove(buffer+2, buffer, strlen(buffer)+1);
1031 buffer[0] = '0';
1032 buffer[1] = 'x';
1033 }
1034 appendstring(buffer);
1035 break;
1036 case '%':
1037 *s++ = '%';
1038 break;
1039 default:
1040 appendstring(p);
1041 goto end;
1042 }
1043 } else
1044 *s++ = *f;
1045 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001046
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001047 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001048 if (callresults)
1049 PyObject_Free(callresults);
1050 if (abuffer)
1051 PyObject_Free(abuffer);
1052 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1053 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001054 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001055 if (callresults) {
1056 PyObject **callresult2 = callresults;
1057 while (callresult2 < callresult) {
1058 Py_DECREF(*callresult2);
1059 ++callresult2;
1060 }
1061 PyObject_Free(callresults);
1062 }
1063 if (abuffer)
1064 PyObject_Free(abuffer);
1065 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001066}
1067
1068#undef appendstring
1069
1070PyObject *
1071PyUnicode_FromFormat(const char *format, ...)
1072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001073 PyObject* ret;
1074 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001075
1076#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001077 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001078#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001079 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001080#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001081 ret = PyUnicode_FromFormatV(format, vargs);
1082 va_end(vargs);
1083 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001084}
1085
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001087 wchar_t *w,
1088 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089{
1090 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyErr_BadInternalCall();
1092 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001094
1095 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099#ifdef HAVE_USABLE_WCHAR_T
1100 memcpy(w, unicode->str, size * sizeof(wchar_t));
1101#else
1102 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001103 register Py_UNICODE *u;
1104 register Py_ssize_t i;
1105 u = PyUnicode_AS_UNICODE(unicode);
1106 for (i = size; i > 0; i--)
1107 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109#endif
1110
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001111 if (size > PyUnicode_GET_SIZE(unicode))
1112 return PyUnicode_GET_SIZE(unicode);
1113 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115}
1116
1117#endif
1118
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001119PyObject *PyUnicode_FromOrdinal(int ordinal)
1120{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001121 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122
1123#ifdef Py_UNICODE_WIDE
1124 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 PyErr_SetString(PyExc_ValueError,
1126 "unichr() arg not in range(0x110000) "
1127 "(wide Python build)");
1128 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001129 }
1130#else
1131 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 PyErr_SetString(PyExc_ValueError,
1133 "unichr() arg not in range(0x10000) "
1134 "(narrow Python build)");
1135 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001136 }
1137#endif
1138
Hye-Shik Chang40574832004-04-06 07:24:51 +00001139 s[0] = (Py_UNICODE)ordinal;
1140 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141}
1142
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143PyObject *PyUnicode_FromObject(register PyObject *obj)
1144{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001146 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001147 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001148 Py_INCREF(obj);
1149 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
1151 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001152 /* For a Unicode subtype that's not a Unicode object,
1153 return a true Unicode object with the same data. */
1154 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1155 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001156 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001157 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1158}
1159
1160PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001161 const char *encoding,
1162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001163{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001165 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001169 PyErr_BadInternalCall();
1170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001173#if 0
1174 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001175 that no encodings is given and then redirect to
1176 PyObject_Unicode() which then applies the additional logic for
1177 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001178
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001179 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001180 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181
1182 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001183 if (PyUnicode_Check(obj)) {
1184 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001185 PyErr_SetString(PyExc_TypeError,
1186 "decoding Unicode is not supported");
1187 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001188 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 return PyObject_Unicode(obj);
1190 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001191#else
1192 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001193 PyErr_SetString(PyExc_TypeError,
1194 "decoding Unicode is not supported");
1195 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001196 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001197#endif
1198
1199 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001200 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001201 s = PyString_AS_STRING(obj);
1202 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001203 }
Christian Heimes3497f942008-05-26 12:29:14 +00001204 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001205 /* Python 2.x specific */
1206 PyErr_Format(PyExc_TypeError,
1207 "decoding bytearray is not supported");
1208 return NULL;
1209 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001210 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001211 /* Overwrite the error message with something more useful in
1212 case of a TypeError. */
1213 if (PyErr_ExceptionMatches(PyExc_TypeError))
1214 PyErr_Format(PyExc_TypeError,
1215 "coercing to Unicode: need string or buffer, "
1216 "%.80s found",
1217 Py_TYPE(obj)->tp_name);
1218 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001221 /* Convert to Unicode */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001222 if (len == 0)
1223 _Py_RETURN_UNICODE_EMPTY();
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001224
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001225 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001226 return v;
1227
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001228 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230}
1231
1232PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001233 Py_ssize_t size,
1234 const char *encoding,
1235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236{
1237 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238
1239 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001240 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001241
1242 /* Shortcuts for common default encodings */
1243 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001245 else if (strcmp(encoding, "latin-1") == 0)
1246 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001247#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1248 else if (strcmp(encoding, "mbcs") == 0)
1249 return PyUnicode_DecodeMBCS(s, size, errors);
1250#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001251 else if (strcmp(encoding, "ascii") == 0)
1252 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253
1254 /* Decode via the codec registry */
1255 buffer = PyBuffer_FromMemory((void *)s, size);
1256 if (buffer == NULL)
1257 goto onError;
1258 unicode = PyCodec_Decode(buffer, encoding, errors);
1259 if (unicode == NULL)
1260 goto onError;
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001263 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001264 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 Py_DECREF(unicode);
1266 goto onError;
1267 }
1268 Py_DECREF(buffer);
1269 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001270
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001271 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 Py_XDECREF(buffer);
1273 return NULL;
1274}
1275
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1277 const char *encoding,
1278 const char *errors)
1279{
1280 PyObject *v;
1281
1282 if (!PyUnicode_Check(unicode)) {
1283 PyErr_BadArgument();
1284 goto onError;
1285 }
1286
1287 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001288 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001289
1290 /* Decode via the codec registry */
1291 v = PyCodec_Decode(unicode, encoding, errors);
1292 if (v == NULL)
1293 goto onError;
1294 return v;
1295
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001296 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001297 return NULL;
1298}
1299
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 Py_ssize_t size,
1302 const char *encoding,
1303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304{
1305 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 unicode = PyUnicode_FromUnicode(s, size);
1308 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1311 Py_DECREF(unicode);
1312 return v;
1313}
1314
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001315PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1316 const char *encoding,
1317 const char *errors)
1318{
1319 PyObject *v;
1320
1321 if (!PyUnicode_Check(unicode)) {
1322 PyErr_BadArgument();
1323 goto onError;
1324 }
1325
1326 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001327 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001328
1329 /* Encode via the codec registry */
1330 v = PyCodec_Encode(unicode, encoding, errors);
1331 if (v == NULL)
1332 goto onError;
1333 return v;
1334
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001335 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001336 return NULL;
1337}
1338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1340 const char *encoding,
1341 const char *errors)
1342{
1343 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
Fred Drakee4315f52000-05-09 19:53:39 +00001349
Tim Petersced69f82003-09-16 20:30:58 +00001350 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001351 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001352
1353 /* Shortcuts for common default encodings */
1354 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001355 if (strcmp(encoding, "utf-8") == 0)
1356 return PyUnicode_AsUTF8String(unicode);
1357 else if (strcmp(encoding, "latin-1") == 0)
1358 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001359#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001360 else if (strcmp(encoding, "mbcs") == 0)
1361 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001362#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 else if (strcmp(encoding, "ascii") == 0)
1364 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366
1367 /* Encode via the codec registry */
1368 v = PyCodec_Encode(unicode, encoding, errors);
1369 if (v == NULL)
1370 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001371 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001373 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001374 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 Py_DECREF(v);
1376 goto onError;
1377 }
1378 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001379
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 return NULL;
1382}
1383
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001384PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001386{
1387 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1388
1389 if (v)
1390 return v;
1391 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1392 if (v && errors == NULL)
1393 ((PyUnicodeObject *)unicode)->defenc = v;
1394 return v;
1395}
1396
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1398{
1399 if (!PyUnicode_Check(unicode)) {
1400 PyErr_BadArgument();
1401 goto onError;
1402 }
1403 return PyUnicode_AS_UNICODE(unicode);
1404
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001405 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 return NULL;
1407}
1408
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
1411 if (!PyUnicode_Check(unicode)) {
1412 PyErr_BadArgument();
1413 goto onError;
1414 }
1415 return PyUnicode_GET_SIZE(unicode);
1416
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418 return -1;
1419}
1420
Thomas Wouters78890102000-07-22 19:25:51 +00001421const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001422{
1423 return unicode_default_encoding;
1424}
1425
1426int PyUnicode_SetDefaultEncoding(const char *encoding)
1427{
1428 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001429
Fred Drakee4315f52000-05-09 19:53:39 +00001430 /* Make sure the encoding is valid. As side effect, this also
1431 loads the encoding into the codec registry cache. */
1432 v = _PyCodec_Lookup(encoding);
1433 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001435 Py_DECREF(v);
1436 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001437 encoding,
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02001438 sizeof(unicode_default_encoding) - 1);
Fred Drakee4315f52000-05-09 19:53:39 +00001439 return 0;
1440
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001442 return -1;
1443}
1444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445/* error handling callback helper:
1446 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001447 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 and adjust various state variables.
1449 return 0 on success, -1 on error
1450*/
1451
1452static
1453int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001454 const char *encoding, const char *reason,
1455 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1456 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1457 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460
1461 PyObject *restuple = NULL;
1462 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1464 Py_ssize_t requiredsize;
1465 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 int res = -1;
1469
1470 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001471 *errorHandler = PyCodec_LookupError(errors);
1472 if (*errorHandler == NULL)
1473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 }
1475
1476 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001477 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001478 encoding, input, insize, *startinpos, *endinpos, reason);
1479 if (*exceptionObject == NULL)
1480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 }
1482 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001483 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1484 goto onError;
1485 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1486 goto onError;
1487 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1488 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 }
1490
1491 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1492 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001493 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001495 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 }
1498 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001499 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001501 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001502 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001503 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1504 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001506
1507 /* need more space? (at least enough for what we
1508 have+the replacement+the rest of the string (starting
1509 at the new input position), so we won't have to check space
1510 when there are no errors in the rest of the string) */
1511 repptr = PyUnicode_AS_UNICODE(repunicode);
1512 repsize = PyUnicode_GET_SIZE(repunicode);
1513 requiredsize = *outpos + repsize + insize-newpos;
1514 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001515 if (requiredsize<2*outsize)
1516 requiredsize = 2*outsize;
1517 if (_PyUnicode_Resize(output, requiredsize) < 0)
1518 goto onError;
1519 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 }
1521 *endinpos = newpos;
1522 *inptr = input + newpos;
1523 Py_UNICODE_COPY(*outptr, repptr, repsize);
1524 *outptr += repsize;
1525 *outpos += repsize;
1526 /* we made it! */
1527 res = 0;
1528
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 Py_XDECREF(restuple);
1531 return res;
1532}
1533
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The ASCII ranges are spelled out rather than delegated to isalnum():
 * isalnum() is locale-dependent, so under some locales bytes >= 0x80
 * would be accepted as base-64 digits (and FROM_BASE64 would then map
 * them to 63), and calling it with a value that does not fit in an
 * unsigned char is undefined behaviour.  Note that c is evaluated more
 * than once; pass a side-effect-free expression. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1565
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * directO and directWS are parenthesized so that compound expressions
 * such as `a || b` keep their meaning inside the expansion.  c is
 * evaluated several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001613 Py_ssize_t size,
1614 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001616 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1617}
1618
Antoine Pitrou653dece2009-05-04 18:32:32 +00001619/* The decoder. The only state we preserve is our read position,
1620 * i.e. how many characters we have consumed. So if we end in the
1621 * middle of a shift sequence we have to back off the read position
1622 * and the output to the beginning of the sequence, otherwise we lose
1623 * all the shift state (seen bits, number of bits seen, high
1624 * surrogate). */
1625
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001626PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001627 Py_ssize_t size,
1628 const char *errors,
1629 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001630{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001632 Py_ssize_t startinpos;
1633 Py_ssize_t endinpos;
1634 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 const char *e;
1636 PyUnicodeObject *unicode;
1637 Py_UNICODE *p;
1638 const char *errmsg = "";
1639 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001640 Py_UNICODE *shiftOutStart;
1641 unsigned int base64bits = 0;
1642 unsigned long base64buffer = 0;
1643 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646
1647 unicode = _PyUnicode_New(size);
1648 if (!unicode)
1649 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001650 if (size == 0) {
1651 if (consumed)
1652 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001654 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001655
1656 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001657 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 e = s + size;
1659
1660 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001661 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662
Antoine Pitrou653dece2009-05-04 18:32:32 +00001663 if (inShift) { /* in a base-64 section */
1664 if (IS_BASE64(ch)) { /* consume a base-64 character */
1665 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1666 base64bits += 6;
1667 s++;
1668 if (base64bits >= 16) {
1669 /* we have enough bits for a UTF-16 value */
1670 Py_UNICODE outCh = (Py_UNICODE)
1671 (base64buffer >> (base64bits-16));
1672 base64bits -= 16;
1673 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1674 if (surrogate) {
1675 /* expecting a second surrogate */
1676 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1677#ifdef Py_UNICODE_WIDE
1678 *p++ = (((surrogate & 0x3FF)<<10)
1679 | (outCh & 0x3FF)) + 0x10000;
1680#else
1681 *p++ = surrogate;
1682 *p++ = outCh;
1683#endif
1684 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001685 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001686 }
1687 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001688 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001689 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 }
1691 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001692 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001693 /* first surrogate */
1694 surrogate = outCh;
1695 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001696 else {
1697 *p++ = outCh;
1698 }
1699 }
1700 }
1701 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 inShift = 0;
1703 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001704 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001705 *p++ = surrogate;
1706 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001708 if (base64bits > 0) { /* left-over bits */
1709 if (base64bits >= 6) {
1710 /* We've seen at least one base-64 character */
1711 errmsg = "partial character in shift sequence";
1712 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001714 else {
1715 /* Some bits remain; they should be zero */
1716 if (base64buffer != 0) {
1717 errmsg = "non-zero padding bits in shift sequence";
1718 goto utf7Error;
1719 }
1720 }
1721 }
1722 if (ch != '-') {
1723 /* '-' is absorbed; other terminating
1724 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 *p++ = ch;
1726 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001727 }
1728 }
1729 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001731 s++; /* consume '+' */
1732 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 s++;
1734 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001735 }
1736 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001738 shiftOutStart = p;
1739 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 }
1741 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 *p++ = ch;
1744 s++;
1745 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001746 else {
1747 startinpos = s-starts;
1748 s++;
1749 errmsg = "unexpected special character";
1750 goto utf7Error;
1751 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001752 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001753utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 outpos = p-PyUnicode_AS_UNICODE(unicode);
1755 endinpos = s-starts;
1756 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001757 errors, &errorHandler,
1758 "utf7", errmsg,
1759 starts, size, &startinpos, &endinpos, &exc, &s,
1760 &unicode, &outpos, &p))
1761 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001762 }
1763
Antoine Pitrou653dece2009-05-04 18:32:32 +00001764 /* end of string */
1765
1766 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1767 /* if we're in an inconsistent state, that's an error */
1768 if (surrogate ||
1769 (base64bits >= 6) ||
1770 (base64bits > 0 && base64buffer != 0)) {
1771 outpos = p-PyUnicode_AS_UNICODE(unicode);
1772 endinpos = size;
1773 if (unicode_decode_call_errorhandler(
1774 errors, &errorHandler,
1775 "utf7", "unterminated shift sequence",
1776 starts, size, &startinpos, &endinpos, &exc, &s,
1777 &unicode, &outpos, &p))
1778 goto onError;
1779 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001780 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001781
1782 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001783 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001784 if (inShift) {
1785 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001786 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001787 }
1788 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001789 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001791 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001793 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001794 goto onError;
1795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 Py_XDECREF(errorHandler);
1797 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001798 return (PyObject *)unicode;
1799
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001800 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 Py_XDECREF(errorHandler);
1802 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 Py_DECREF(unicode);
1804 return NULL;
1805}
1806
1807
1808PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001809 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 int base64SetO,
1811 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001812 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813{
1814 PyObject *v;
1815 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001816 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001818 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001819 unsigned int base64bits = 0;
1820 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001821 char * out;
1822 char * start;
1823
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001824 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001825 return PyErr_NoMemory();
1826
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001828 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001829
Antoine Pitrou653dece2009-05-04 18:32:32 +00001830 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001831 if (v == NULL)
1832 return NULL;
1833
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001834 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835 for (;i < size; ++i) {
1836 Py_UNICODE ch = s[i];
1837
Antoine Pitrou653dece2009-05-04 18:32:32 +00001838 if (inShift) {
1839 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1840 /* shifting out */
1841 if (base64bits) { /* output remaining bits */
1842 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1843 base64buffer = 0;
1844 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 }
1846 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001847 /* Characters not in the BASE64 set implicitly unshift the sequence
1848 so no '-' is required, except if the character is itself a '-' */
1849 if (IS_BASE64(ch) || ch == '-') {
1850 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001852 *out++ = (char) ch;
1853 }
1854 else {
1855 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001856 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001857 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001858 else { /* not in a shift sequence */
1859 if (ch == '+') {
1860 *out++ = '+';
1861 *out++ = '-';
1862 }
1863 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864 *out++ = (char) ch;
1865 }
1866 else {
1867 *out++ = '+';
1868 inShift = 1;
1869 goto encode_char;
1870 }
1871 }
1872 continue;
1873encode_char:
1874#ifdef Py_UNICODE_WIDE
1875 if (ch >= 0x10000) {
1876 /* code first surrogate */
1877 base64bits += 16;
1878 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1879 while (base64bits >= 6) {
1880 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1881 base64bits -= 6;
1882 }
1883 /* prepare second surrogate */
1884 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1885 }
1886#endif
1887 base64bits += 16;
1888 base64buffer = (base64buffer << 16) | ch;
1889 while (base64bits >= 6) {
1890 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1891 base64bits -= 6;
1892 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001893 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001894 if (base64bits)
1895 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1896 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001898
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001899 if (_PyString_Resize(&v, out - start))
1900 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001901 return v;
1902}
1903
Antoine Pitrou653dece2009-05-04 18:32:32 +00001904#undef IS_BASE64
1905#undef FROM_BASE64
1906#undef TO_BASE64
1907#undef DECODE_DIRECT
1908#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909
/* --- UTF-8 Codec -------------------------------------------------------- */

static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is only ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1933
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001935 Py_ssize_t size,
1936 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937{
Walter Dörwald69652032004-09-07 20:24:22 +00001938 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1939}
1940
1941PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001942 Py_ssize_t size,
1943 const char *errors,
1944 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001948 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001949 Py_ssize_t startinpos;
1950 Py_ssize_t endinpos;
1951 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 const char *e;
1953 PyUnicodeObject *unicode;
1954 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 PyObject *errorHandler = NULL;
1957 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958
1959 /* Note: size will always be longer than the resulting Unicode
1960 character count */
1961 unicode = _PyUnicode_New(size);
1962 if (!unicode)
1963 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001964 if (size == 0) {
1965 if (consumed)
1966 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969
1970 /* Unpack UTF-8 encoded data */
1971 p = unicode->str;
1972 e = s + size;
1973
1974 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001975 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
1977 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001978 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 s++;
1980 continue;
1981 }
1982
1983 n = utf8_code_length[ch];
1984
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001985 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001986 if (consumed)
1987 break;
1988 else {
1989 errmsg = "unexpected end of data";
1990 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001991 endinpos = startinpos+1;
1992 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1993 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001994 goto utf8Error;
1995 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
1998 switch (n) {
1999
2000 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002002 startinpos = s-starts;
2003 endinpos = startinpos+1;
2004 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005
2006 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002007 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 startinpos = s-starts;
2009 endinpos = startinpos+1;
2010 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011
2012 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002013 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00002014 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002016 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 goto utf8Error;
2018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002020 assert ((ch > 0x007F) && (ch <= 0x07FF));
2021 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 break;
2023
2024 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00002025 /* XXX: surrogates shouldn't be valid UTF-8!
2026 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2027 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2028 Uncomment the 2 lines below to make them invalid,
2029 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00002030 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002031 (s[2] & 0xc0) != 0x80 ||
2032 ((unsigned char)s[0] == 0xE0 &&
2033 (unsigned char)s[1] < 0xA0)/* ||
2034 ((unsigned char)s[0] == 0xED &&
2035 (unsigned char)s[1] > 0x9F)*/) {
2036 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002037 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002038 endinpos = startinpos + 1;
2039
2040 /* if s[1] first two bits are 1 and 0, then the invalid
2041 continuation byte is s[2], so increment endinpos by 1,
2042 if not, s[1] is invalid and endinpos doesn't need to
2043 be incremented. */
2044 if ((s[1] & 0xC0) == 0x80)
2045 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 goto utf8Error;
2047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002049 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2050 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002051 break;
2052
2053 case 4:
2054 if ((s[1] & 0xc0) != 0x80 ||
2055 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002056 (s[3] & 0xc0) != 0x80 ||
2057 ((unsigned char)s[0] == 0xF0 &&
2058 (unsigned char)s[1] < 0x90) ||
2059 ((unsigned char)s[0] == 0xF4 &&
2060 (unsigned char)s[1] > 0x8F)) {
2061 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002062 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002063 endinpos = startinpos + 1;
2064 if ((s[1] & 0xC0) == 0x80) {
2065 endinpos++;
2066 if ((s[2] & 0xC0) == 0x80)
2067 endinpos++;
2068 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002069 goto utf8Error;
2070 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002071 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002072 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2073 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2074
Fredrik Lundh8f455852001-06-27 18:59:43 +00002075#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002076 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002078 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002079
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002080 /* translate from 10000..10FFFF to 0..FFFF */
2081 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002082
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002083 /* high surrogate = top 10 bits added to D800 */
2084 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002085
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002086 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002087 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 }
2091 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002092 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002093
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002094 utf8Error:
2095 outpos = p-PyUnicode_AS_UNICODE(unicode);
2096 if (unicode_decode_call_errorhandler(
2097 errors, &errorHandler,
2098 "utf8", errmsg,
2099 starts, size, &startinpos, &endinpos, &exc, &s,
2100 &unicode, &outpos, &p))
2101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 }
Walter Dörwald69652032004-09-07 20:24:22 +00002103 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002104 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105
2106 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002107 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 goto onError;
2109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 Py_XDECREF(errorHandler);
2111 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 return (PyObject *)unicode;
2113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002114 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002115 Py_XDECREF(errorHandler);
2116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 Py_DECREF(unicode);
2118 return NULL;
2119}
2120
Tim Peters602f7402002-04-27 18:03:26 +00002121/* Allocation strategy: if the string is short, convert into a stack buffer
2122 and allocate exactly as much space needed at the end. Else allocate the
2123 maximum possible needed (4 result bytes per Unicode character), and return
2124 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002125*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002126PyObject *
2127PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002128 Py_ssize_t size,
2129 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130{
Tim Peters602f7402002-04-27 18:03:26 +00002131#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002132
Martin v. Löwis18e16552006-02-15 17:27:45 +00002133 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002134 PyObject *v; /* result string object */
2135 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002136 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002137 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002138 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002139
Tim Peters602f7402002-04-27 18:03:26 +00002140 assert(s != NULL);
2141 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142
Tim Peters602f7402002-04-27 18:03:26 +00002143 if (size <= MAX_SHORT_UNICHARS) {
2144 /* Write into the stack buffer; nallocated can't overflow.
2145 * At the end, we'll allocate exactly as much heap space as it
2146 * turns out we need.
2147 */
2148 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2149 v = NULL; /* will allocate after we're done */
2150 p = stackbuf;
2151 }
2152 else {
2153 /* Overallocate on the heap, and give the excess back at the end. */
2154 nallocated = size * 4;
2155 if (nallocated / 4 != size) /* overflow! */
2156 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002157 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002158 if (v == NULL)
2159 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002162
Tim Peters602f7402002-04-27 18:03:26 +00002163 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002164 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002165
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002166 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002167 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002169
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002171 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002172 *p++ = (char)(0xc0 | (ch >> 6));
2173 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002174 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002175 else {
Tim Peters602f7402002-04-27 18:03:26 +00002176 /* Encode UCS2 Unicode ordinals */
2177 if (ch < 0x10000) {
2178 /* Special case: check for high surrogate */
2179 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2180 Py_UCS4 ch2 = s[i];
2181 /* Check for low surrogate and combine the two to
2182 form a UCS4 value */
2183 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002184 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002185 i++;
2186 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002187 }
Tim Peters602f7402002-04-27 18:03:26 +00002188 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002189 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002190 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002191 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2192 *p++ = (char)(0x80 | (ch & 0x3f));
2193 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002194 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002195 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002196 /* Encode UCS4 Unicode ordinals */
2197 *p++ = (char)(0xf0 | (ch >> 18));
2198 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2199 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2200 *p++ = (char)(0x80 | (ch & 0x3f));
2201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002203
Tim Peters602f7402002-04-27 18:03:26 +00002204 if (v == NULL) {
2205 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002206 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002207 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002208 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002209 }
2210 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002211 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002212 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002213 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002214 if (_PyString_Resize(&v, nneeded))
2215 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002218
Tim Peters602f7402002-04-27 18:03:26 +00002219#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220}
2221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2223{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 if (!PyUnicode_Check(unicode)) {
2225 PyErr_BadArgument();
2226 return NULL;
2227 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002228 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002229 PyUnicode_GET_SIZE(unicode),
2230 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231}
2232
Walter Dörwald6e390802007-08-17 16:41:28 +00002233/* --- UTF-32 Codec ------------------------------------------------------- */
2234
2235PyObject *
2236PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002237 Py_ssize_t size,
2238 const char *errors,
2239 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002240{
2241 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2242}
2243
2244PyObject *
2245PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002246 Py_ssize_t size,
2247 const char *errors,
2248 int *byteorder,
2249 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002250{
2251 const char *starts = s;
2252 Py_ssize_t startinpos;
2253 Py_ssize_t endinpos;
2254 Py_ssize_t outpos;
2255 PyUnicodeObject *unicode;
2256 Py_UNICODE *p;
2257#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002258 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002259 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002260#else
2261 const int pairs = 0;
2262#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002263 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002264 int bo = 0; /* assume native ordering by default */
2265 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002266 /* Offsets from q for retrieving bytes in the right order. */
2267#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2268 int iorder[] = {0, 1, 2, 3};
2269#else
2270 int iorder[] = {3, 2, 1, 0};
2271#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002272 PyObject *errorHandler = NULL;
2273 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002274
Walter Dörwald6e390802007-08-17 16:41:28 +00002275 q = (unsigned char *)s;
2276 e = q + size;
2277
2278 if (byteorder)
2279 bo = *byteorder;
2280
2281 /* Check for BOM marks (U+FEFF) in the input and adjust current
2282 byte order setting accordingly. In native mode, the leading BOM
2283 mark is skipped, in all other modes, it is copied to the output
2284 stream as-is (giving a ZWNBSP character). */
2285 if (bo == 0) {
2286 if (size >= 4) {
2287 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002288 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002289#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002290 if (bom == 0x0000FEFF) {
2291 q += 4;
2292 bo = -1;
2293 }
2294 else if (bom == 0xFFFE0000) {
2295 q += 4;
2296 bo = 1;
2297 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002298#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002299 if (bom == 0x0000FEFF) {
2300 q += 4;
2301 bo = 1;
2302 }
2303 else if (bom == 0xFFFE0000) {
2304 q += 4;
2305 bo = -1;
2306 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002307#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002308 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002309 }
2310
2311 if (bo == -1) {
2312 /* force LE */
2313 iorder[0] = 0;
2314 iorder[1] = 1;
2315 iorder[2] = 2;
2316 iorder[3] = 3;
2317 }
2318 else if (bo == 1) {
2319 /* force BE */
2320 iorder[0] = 3;
2321 iorder[1] = 2;
2322 iorder[2] = 1;
2323 iorder[3] = 0;
2324 }
2325
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002326 /* On narrow builds we split characters outside the BMP into two
2327 codepoints => count how much extra space we need. */
2328#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002329 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002330 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2331 pairs++;
2332#endif
2333
2334 /* This might be one to much, because of a BOM */
2335 unicode = _PyUnicode_New((size+3)/4+pairs);
2336 if (!unicode)
2337 return NULL;
2338 if (size == 0)
2339 return (PyObject *)unicode;
2340
2341 /* Unpack UTF-32 encoded data */
2342 p = unicode->str;
2343
Walter Dörwald6e390802007-08-17 16:41:28 +00002344 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002345 Py_UCS4 ch;
2346 /* remaining bytes at the end? (size should be divisible by 4) */
2347 if (e-q<4) {
2348 if (consumed)
2349 break;
2350 errmsg = "truncated data";
2351 startinpos = ((const char *)q)-starts;
2352 endinpos = ((const char *)e)-starts;
2353 goto utf32Error;
2354 /* The remaining input chars are ignored if the callback
2355 chooses to skip the input */
2356 }
2357 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2358 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002359
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002360 if (ch >= 0x110000)
2361 {
2362 errmsg = "codepoint not in range(0x110000)";
2363 startinpos = ((const char *)q)-starts;
2364 endinpos = startinpos+4;
2365 goto utf32Error;
2366 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002367#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002368 if (ch >= 0x10000)
2369 {
2370 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2371 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2372 }
2373 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002374#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002375 *p++ = ch;
2376 q += 4;
2377 continue;
2378 utf32Error:
2379 outpos = p-PyUnicode_AS_UNICODE(unicode);
2380 if (unicode_decode_call_errorhandler(
2381 errors, &errorHandler,
2382 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002383 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002384 &unicode, &outpos, &p))
2385 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002386 }
2387
2388 if (byteorder)
2389 *byteorder = bo;
2390
2391 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002392 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002393
2394 /* Adjust length */
2395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2396 goto onError;
2397
2398 Py_XDECREF(errorHandler);
2399 Py_XDECREF(exc);
2400 return (PyObject *)unicode;
2401
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002403 Py_DECREF(unicode);
2404 Py_XDECREF(errorHandler);
2405 Py_XDECREF(exc);
2406 return NULL;
2407}
2408
2409PyObject *
2410PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002411 Py_ssize_t size,
2412 const char *errors,
2413 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002414{
2415 PyObject *v;
2416 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002417 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002418#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002419 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002420#else
2421 const int pairs = 0;
2422#endif
2423 /* Offsets from p for storing byte pairs in the right order. */
2424#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2425 int iorder[] = {0, 1, 2, 3};
2426#else
2427 int iorder[] = {3, 2, 1, 0};
2428#endif
2429
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002430#define STORECHAR(CH) \
2431 do { \
2432 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2433 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2434 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2435 p[iorder[0]] = (CH) & 0xff; \
2436 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002437 } while(0)
2438
2439 /* In narrow builds we can output surrogate pairs as one codepoint,
2440 so we need less space. */
2441#ifndef Py_UNICODE_WIDE
2442 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002443 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2444 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2445 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002446#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002447 nsize = (size - pairs + (byteorder == 0));
2448 bytesize = nsize * 4;
2449 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002450 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002451 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002452 if (v == NULL)
2453 return NULL;
2454
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002455 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002457 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002458 if (size == 0)
2459 return v;
2460
2461 if (byteorder == -1) {
2462 /* force LE */
2463 iorder[0] = 0;
2464 iorder[1] = 1;
2465 iorder[2] = 2;
2466 iorder[3] = 3;
2467 }
2468 else if (byteorder == 1) {
2469 /* force BE */
2470 iorder[0] = 3;
2471 iorder[1] = 2;
2472 iorder[2] = 1;
2473 iorder[3] = 0;
2474 }
2475
2476 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002477 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002478#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002479 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2480 Py_UCS4 ch2 = *s;
2481 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2482 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2483 s++;
2484 size--;
2485 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002486 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002487#endif
2488 STORECHAR(ch);
2489 }
2490 return v;
2491#undef STORECHAR
2492}
2493
2494PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002501 PyUnicode_GET_SIZE(unicode),
2502 NULL,
2503 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002504}
2505
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506/* --- UTF-16 Codec ------------------------------------------------------- */
2507
Tim Peters772747b2001-08-09 22:21:55 +00002508PyObject *
2509PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002510 Py_ssize_t size,
2511 const char *errors,
2512 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513{
Walter Dörwald69652032004-09-07 20:24:22 +00002514 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2515}
2516
2517PyObject *
2518PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002519 Py_ssize_t size,
2520 const char *errors,
2521 int *byteorder,
2522 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002525 Py_ssize_t startinpos;
2526 Py_ssize_t endinpos;
2527 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 PyUnicodeObject *unicode;
2529 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002530 const unsigned char *q, *e;
2531 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002532 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002533 /* Offsets from q for retrieving byte pairs in the right order. */
2534#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2535 int ihi = 1, ilo = 0;
2536#else
2537 int ihi = 0, ilo = 1;
2538#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 PyObject *errorHandler = NULL;
2540 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
2542 /* Note: size will always be longer than the resulting Unicode
2543 character count */
2544 unicode = _PyUnicode_New(size);
2545 if (!unicode)
2546 return NULL;
2547 if (size == 0)
2548 return (PyObject *)unicode;
2549
2550 /* Unpack UTF-16 encoded data */
2551 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002552 q = (unsigned char *)s;
2553 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554
2555 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002556 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002558 /* Check for BOM marks (U+FEFF) in the input and adjust current
2559 byte order setting accordingly. In native mode, the leading BOM
2560 mark is skipped, in all other modes, it is copied to the output
2561 stream as-is (giving a ZWNBSP character). */
2562 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002563 if (size >= 2) {
2564 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002565#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566 if (bom == 0xFEFF) {
2567 q += 2;
2568 bo = -1;
2569 }
2570 else if (bom == 0xFFFE) {
2571 q += 2;
2572 bo = 1;
2573 }
Tim Petersced69f82003-09-16 20:30:58 +00002574#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002575 if (bom == 0xFEFF) {
2576 q += 2;
2577 bo = 1;
2578 }
2579 else if (bom == 0xFFFE) {
2580 q += 2;
2581 bo = -1;
2582 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002583#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586
Tim Peters772747b2001-08-09 22:21:55 +00002587 if (bo == -1) {
2588 /* force LE */
2589 ihi = 1;
2590 ilo = 0;
2591 }
2592 else if (bo == 1) {
2593 /* force BE */
2594 ihi = 0;
2595 ilo = 1;
2596 }
2597
2598 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002599 Py_UNICODE ch;
2600 /* remaining bytes at the end? (size should be even) */
2601 if (e-q<2) {
2602 if (consumed)
2603 break;
2604 errmsg = "truncated data";
2605 startinpos = ((const char *)q)-starts;
2606 endinpos = ((const char *)e)-starts;
2607 goto utf16Error;
2608 /* The remaining input chars are ignored if the callback
2609 chooses to skip the input */
2610 }
2611 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612
Benjamin Peterson857ce152009-01-31 16:29:18 +00002613 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002614
2615 if (ch < 0xD800 || ch > 0xDFFF) {
2616 *p++ = ch;
2617 continue;
2618 }
2619
2620 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002621 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002622 q -= 2;
2623 if (consumed)
2624 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002625 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002626 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002627 endinpos = ((const char *)e)-starts;
2628 goto utf16Error;
2629 }
2630 if (0xD800 <= ch && ch <= 0xDBFF) {
2631 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2632 q += 2;
2633 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002634#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002635 *p++ = ch;
2636 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002637#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002638 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002639#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002640 continue;
2641 }
2642 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002643 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002644 startinpos = (((const char *)q)-4)-starts;
2645 endinpos = startinpos+2;
2646 goto utf16Error;
2647 }
2648
Benjamin Peterson857ce152009-01-31 16:29:18 +00002649 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002650 errmsg = "illegal encoding";
2651 startinpos = (((const char *)q)-2)-starts;
2652 endinpos = startinpos+2;
2653 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002654
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002655 utf16Error:
2656 outpos = p-PyUnicode_AS_UNICODE(unicode);
2657 if (unicode_decode_call_errorhandler(
2658 errors, &errorHandler,
2659 "utf16", errmsg,
2660 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2661 &unicode, &outpos, &p))
2662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 }
2664
2665 if (byteorder)
2666 *byteorder = bo;
2667
Walter Dörwald69652032004-09-07 20:24:22 +00002668 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002670
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002672 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 goto onError;
2674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 Py_XDECREF(errorHandler);
2676 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 return (PyObject *)unicode;
2678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 Py_XDECREF(errorHandler);
2682 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 return NULL;
2684}
2685
Tim Peters772747b2001-08-09 22:21:55 +00002686PyObject *
2687PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002688 Py_ssize_t size,
2689 const char *errors,
2690 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691{
2692 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002693 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002694 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002695#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002696 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002697#else
2698 const int pairs = 0;
2699#endif
Tim Peters772747b2001-08-09 22:21:55 +00002700 /* Offsets from p for storing byte pairs in the right order. */
2701#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2702 int ihi = 1, ilo = 0;
2703#else
2704 int ihi = 0, ilo = 1;
2705#endif
2706
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002707#define STORECHAR(CH) \
2708 do { \
2709 p[ihi] = ((CH) >> 8) & 0xff; \
2710 p[ilo] = (CH) & 0xff; \
2711 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002712 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002714#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002715 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002716 if (s[i] >= 0x10000)
2717 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002718#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002719 /* 2 * (size + pairs + (byteorder == 0)) */
2720 if (size > PY_SSIZE_T_MAX ||
2721 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002722 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002723 nsize = size + pairs + (byteorder == 0);
2724 bytesize = nsize * 2;
2725 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002726 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002727 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 if (v == NULL)
2729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002731 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002733 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002734 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002735 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002736
2737 if (byteorder == -1) {
2738 /* force LE */
2739 ihi = 1;
2740 ilo = 0;
2741 }
2742 else if (byteorder == 1) {
2743 /* force BE */
2744 ihi = 0;
2745 ilo = 1;
2746 }
2747
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002748 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002749 Py_UNICODE ch = *s++;
2750 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002751#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002752 if (ch >= 0x10000) {
2753 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2754 ch = 0xD800 | ((ch-0x10000) >> 10);
2755 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002756#endif
Tim Peters772747b2001-08-09 22:21:55 +00002757 STORECHAR(ch);
2758 if (ch2)
2759 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002762#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763}
2764
2765PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2766{
2767 if (!PyUnicode_Check(unicode)) {
2768 PyErr_BadArgument();
2769 return NULL;
2770 }
2771 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002772 PyUnicode_GET_SIZE(unicode),
2773 NULL,
2774 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775}
2776
2777/* --- Unicode Escape Codec ----------------------------------------------- */
2778
Fredrik Lundh06d12682001-01-24 07:59:11 +00002779static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002782 Py_ssize_t size,
2783 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002786 Py_ssize_t startinpos;
2787 Py_ssize_t endinpos;
2788 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 char* message;
2793 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002794 PyObject *errorHandler = NULL;
2795 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 /* Escaped strings will always be longer than the resulting
2798 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 length after conversion to the true value.
2800 (but if the error callback returns a long replacement string
2801 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 v = _PyUnicode_New(size);
2803 if (v == NULL)
2804 goto onError;
2805 if (size == 0)
2806 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 while (s < end) {
2812 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002813 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
2816 /* Non-escape characters are interpreted as Unicode ordinals */
2817 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 continue;
2820 }
2821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 /* \ - Escapes */
2824 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002825 c = *s++;
2826 if (s > end)
2827 c = '\0'; /* Invalid after \ */
2828 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002830 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 case '\n': break;
2832 case '\\': *p++ = '\\'; break;
2833 case '\'': *p++ = '\''; break;
2834 case '\"': *p++ = '\"'; break;
2835 case 'b': *p++ = '\b'; break;
2836 case 'f': *p++ = '\014'; break; /* FF */
2837 case 't': *p++ = '\t'; break;
2838 case 'n': *p++ = '\n'; break;
2839 case 'r': *p++ = '\r'; break;
2840 case 'v': *p++ = '\013'; break; /* VT */
2841 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2842
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002843 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 case '0': case '1': case '2': case '3':
2845 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002846 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002847 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002848 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002849 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002850 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002852 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 break;
2854
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002855 /* hex escapes */
2856 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002858 digits = 2;
2859 message = "truncated \\xXX escape";
2860 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002862 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864 digits = 4;
2865 message = "truncated \\uXXXX escape";
2866 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002868 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 digits = 8;
2871 message = "truncated \\UXXXXXXXX escape";
2872 hexescape:
2873 chr = 0;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002874 if (end - s < digits) {
2875 /* count only hex digits */
2876 for (; s < end; ++s) {
2877 c = (unsigned char)*s;
2878 if (!Py_ISXDIGIT(c))
2879 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002880 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002881 goto error;
2882 }
2883 for (; digits--; ++s) {
2884 c = (unsigned char)*s;
2885 if (!Py_ISXDIGIT(c))
2886 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002887 chr = (chr<<4) & ~0xF;
2888 if (c >= '0' && c <= '9')
2889 chr += c - '0';
2890 else if (c >= 'a' && c <= 'f')
2891 chr += 10 + c - 'a';
2892 else
2893 chr += 10 + c - 'A';
2894 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002895 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002896 /* _decoding_error will have already written into the
2897 target buffer. */
2898 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002899 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002900 /* when we get here, chr is a 32-bit unicode character */
2901 if (chr <= 0xffff)
2902 /* UCS-2 character */
2903 *p++ = (Py_UNICODE) chr;
2904 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002905 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002906 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002907#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002908 *p++ = chr;
2909#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002910 chr -= 0x10000L;
2911 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002912 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002913#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002914 } else {
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002915 message = "illegal Unicode character";
2916 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002917 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 break;
2919
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002920 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002921 case 'N':
2922 message = "malformed \\N character escape";
2923 if (ucnhash_CAPI == NULL) {
2924 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002925 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 if (ucnhash_CAPI == NULL)
2927 goto ucnhashError;
2928 }
2929 if (*s == '{') {
2930 const char *start = s+1;
2931 /* look for the closing brace */
2932 while (*s != '}' && s < end)
2933 s++;
2934 if (s > start && s < end && *s == '}') {
2935 /* found a name. look it up in the unicode database */
2936 message = "unknown Unicode character name";
2937 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002938 if (s - start - 1 <= INT_MAX &&
2939 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 goto store;
2941 }
2942 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002943 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002944
2945 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002946 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 message = "\\ at end of string";
2948 s--;
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002949 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00002950 }
2951 else {
2952 *p++ = '\\';
2953 *p++ = (unsigned char)s[-1];
2954 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002955 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 }
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002957 continue;
2958
2959 error:
2960 endinpos = s-starts;
2961 outpos = p-PyUnicode_AS_UNICODE(v);
2962 if (unicode_decode_call_errorhandler(
2963 errors, &errorHandler,
2964 "unicodeescape", message,
2965 starts, size, &startinpos, &endinpos, &exc, &s,
2966 &v, &outpos, &p))
2967 goto onError;
2968 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002970 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002971 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002972 Py_XDECREF(errorHandler);
2973 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002975
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002976 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002977 PyErr_SetString(
2978 PyExc_UnicodeError,
2979 "\\N escapes not supported (can't load unicodedata module)"
2980 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002981 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 Py_XDECREF(errorHandler);
2983 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002984 return NULL;
2985
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002986 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988 Py_XDECREF(errorHandler);
2989 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 return NULL;
2991}
2992
2993/* Return a Unicode-Escape string version of the Unicode object.
2994
2995 If quotes is true, the string is enclosed in u"" or u'' quotes as
2996 appropriate.
2997
2998*/
2999
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00003000Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003001 Py_ssize_t size,
3002 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00003003{
3004 /* like wcschr, but doesn't stop at NULL characters */
3005
3006 while (size-- > 0) {
3007 if (*s == ch)
3008 return s;
3009 s++;
3010 }
3011
3012 return NULL;
3013}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003014
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015static
3016PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003017 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 int quotes)
3019{
3020 PyObject *repr;
3021 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003023 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003024#ifdef Py_UNICODE_WIDE
3025 const Py_ssize_t expandsize = 10;
3026#else
3027 const Py_ssize_t expandsize = 6;
3028#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
Neal Norwitz17753ec2006-08-21 22:21:19 +00003030 /* XXX(nnorwitz): rather than over-allocating, it would be
3031 better to choose a different scheme. Perhaps scan the
3032 first N-chars of the string and allocate based on that size.
3033 */
3034 /* Initial allocation is based on the longest-possible unichr
3035 escape.
3036
3037 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3038 unichr, so in this case it's the longest unichr escape. In
3039 narrow (UTF-16) builds this is five chars per source unichr
3040 since there are two unichrs in the surrogate pair, so in narrow
3041 (UTF-16) builds it's not the longest unichr escape.
3042
3043 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3044 so in the narrow (UTF-16) build case it's the longest unichr
3045 escape.
3046 */
3047
Neal Norwitze7d8be82008-07-31 17:17:14 +00003048 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003049 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003050
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003051 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003052 2
3053 + expandsize*size
3054 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 if (repr == NULL)
3056 return NULL;
3057
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003058 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059
3060 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003062 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 !findchar(s, size, '"')) ? '"' : '\'';
3064 }
3065 while (size-- > 0) {
3066 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003067
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003068 /* Escape quotes and backslashes */
3069 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003070 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 *p++ = '\\';
3072 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003073 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003074 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003075
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003076#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003077 /* Map 21-bit characters to '\U00xxxxxx' */
3078 else if (ch >= 0x10000) {
3079 *p++ = '\\';
3080 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003081 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3082 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3083 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3084 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3085 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3086 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3087 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003088 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003089 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003090 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003091#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003092 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3093 else if (ch >= 0xD800 && ch < 0xDC00) {
3094 Py_UNICODE ch2;
3095 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003096
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003097 ch2 = *s++;
3098 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003099 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003100 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3101 *p++ = '\\';
3102 *p++ = 'U';
3103 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3104 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3105 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3106 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3107 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3108 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3109 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3110 *p++ = hexdigit[ucs & 0x0000000F];
3111 continue;
3112 }
3113 /* Fall through: isolated surrogates are copied as-is */
3114 s--;
3115 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003116 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003117#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003120 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 *p++ = '\\';
3122 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003123 *p++ = hexdigit[(ch >> 12) & 0x000F];
3124 *p++ = hexdigit[(ch >> 8) & 0x000F];
3125 *p++ = hexdigit[(ch >> 4) & 0x000F];
3126 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003128
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003129 /* Map special whitespace to '\t', \n', '\r' */
3130 else if (ch == '\t') {
3131 *p++ = '\\';
3132 *p++ = 't';
3133 }
3134 else if (ch == '\n') {
3135 *p++ = '\\';
3136 *p++ = 'n';
3137 }
3138 else if (ch == '\r') {
3139 *p++ = '\\';
3140 *p++ = 'r';
3141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003143 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003144 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003146 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003147 *p++ = hexdigit[(ch >> 4) & 0x000F];
3148 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003149 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003150
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 /* Copy everything else as-is */
3152 else
3153 *p++ = (char) ch;
3154 }
3155 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003156 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157
3158 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003159 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 return repr;
3162}
3163
3164PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003165 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166{
3167 return unicodeescape_string(s, size, 0);
3168}
3169
3170PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3171{
3172 if (!PyUnicode_Check(unicode)) {
3173 PyErr_BadArgument();
3174 return NULL;
3175 }
3176 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003177 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178}
3179
3180/* --- Raw Unicode Escape Codec ------------------------------------------- */
3181
3182PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003183 Py_ssize_t size,
3184 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003187 Py_ssize_t startinpos;
3188 Py_ssize_t endinpos;
3189 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 const char *end;
3193 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 PyObject *errorHandler = NULL;
3195 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 /* Escaped strings will always be longer than the resulting
3198 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199 length after conversion to the true value. (But decoding error
3200 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 v = _PyUnicode_New(size);
3202 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003203 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003205 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 end = s + size;
3208 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003209 unsigned char c;
3210 Py_UCS4 x;
3211 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003212 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003214 /* Non-escape characters are interpreted as Unicode ordinals */
3215 if (*s != '\\') {
3216 *p++ = (unsigned char)*s++;
3217 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003218 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003219 startinpos = s-starts;
3220
3221 /* \u-escapes are only interpreted iff the number of leading
3222 backslashes if odd */
3223 bs = s;
3224 for (;s < end;) {
3225 if (*s != '\\')
3226 break;
3227 *p++ = (unsigned char)*s++;
3228 }
3229 if (((s - bs) & 1) == 0 ||
3230 s >= end ||
3231 (*s != 'u' && *s != 'U')) {
3232 continue;
3233 }
3234 p--;
3235 count = *s=='u' ? 4 : 8;
3236 s++;
3237
3238 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3239 outpos = p-PyUnicode_AS_UNICODE(v);
3240 for (x = 0, i = 0; i < count; ++i, ++s) {
3241 c = (unsigned char)*s;
3242 if (!isxdigit(c)) {
3243 endinpos = s-starts;
3244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "truncated \\uXXXX",
3247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
3250 goto nextByte;
3251 }
3252 x = (x<<4) & ~0xF;
3253 if (c >= '0' && c <= '9')
3254 x += c - '0';
3255 else if (c >= 'a' && c <= 'f')
3256 x += 10 + c - 'a';
3257 else
3258 x += 10 + c - 'A';
3259 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003260 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003261 /* UCS-2 character */
3262 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003263 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003264 /* UCS-4 character. Either store directly, or as
3265 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003266#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003267 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003268#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 x -= 0x10000L;
3270 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3271 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003272#endif
3273 } else {
3274 endinpos = s-starts;
3275 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276 if (unicode_decode_call_errorhandler(
3277 errors, &errorHandler,
3278 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003279 starts, size, &startinpos, &endinpos, &exc, &s,
3280 &v, &outpos, &p))
3281 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 nextByte:
3284 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 Py_XDECREF(errorHandler);
3289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003291
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 Py_XDECREF(errorHandler);
3295 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 return NULL;
3297}
3298
3299PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003300 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301{
3302 PyObject *repr;
3303 char *p;
3304 char *q;
3305
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003306 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003308 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003309#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003310 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003311#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003312
Neal Norwitze7d8be82008-07-31 17:17:14 +00003313 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003315
Neal Norwitze7d8be82008-07-31 17:17:14 +00003316 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 if (repr == NULL)
3318 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003319 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003320 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003322 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 while (size-- > 0) {
3324 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003326 /* Map 32-bit characters to '\Uxxxxxxxx' */
3327 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003328 *p++ = '\\';
3329 *p++ = 'U';
3330 *p++ = hexdigit[(ch >> 28) & 0xf];
3331 *p++ = hexdigit[(ch >> 24) & 0xf];
3332 *p++ = hexdigit[(ch >> 20) & 0xf];
3333 *p++ = hexdigit[(ch >> 16) & 0xf];
3334 *p++ = hexdigit[(ch >> 12) & 0xf];
3335 *p++ = hexdigit[(ch >> 8) & 0xf];
3336 *p++ = hexdigit[(ch >> 4) & 0xf];
3337 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003338 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003339 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003340#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003341 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3342 if (ch >= 0xD800 && ch < 0xDC00) {
3343 Py_UNICODE ch2;
3344 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003345
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003346 ch2 = *s++;
3347 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003348 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003349 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3350 *p++ = '\\';
3351 *p++ = 'U';
3352 *p++ = hexdigit[(ucs >> 28) & 0xf];
3353 *p++ = hexdigit[(ucs >> 24) & 0xf];
3354 *p++ = hexdigit[(ucs >> 20) & 0xf];
3355 *p++ = hexdigit[(ucs >> 16) & 0xf];
3356 *p++ = hexdigit[(ucs >> 12) & 0xf];
3357 *p++ = hexdigit[(ucs >> 8) & 0xf];
3358 *p++ = hexdigit[(ucs >> 4) & 0xf];
3359 *p++ = hexdigit[ucs & 0xf];
3360 continue;
3361 }
3362 /* Fall through: isolated surrogates are copied as-is */
3363 s--;
3364 size++;
3365 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003366#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003367 /* Map 16-bit characters to '\uxxxx' */
3368 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 *p++ = '\\';
3370 *p++ = 'u';
3371 *p++ = hexdigit[(ch >> 12) & 0xf];
3372 *p++ = hexdigit[(ch >> 8) & 0xf];
3373 *p++ = hexdigit[(ch >> 4) & 0xf];
3374 *p++ = hexdigit[ch & 15];
3375 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003376 /* Copy everything else as-is */
3377 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 *p++ = (char) ch;
3379 }
3380 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003381 if (_PyString_Resize(&repr, p - q))
3382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 return repr;
3384}
3385
3386PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3387{
3388 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 PyErr_BadArgument();
3390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 }
3392 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003393 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394}
3395
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396/* --- Unicode Internal Codec ------------------------------------------- */
3397
3398PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003399 Py_ssize_t size,
3400 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401{
3402 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003403 Py_ssize_t startinpos;
3404 Py_ssize_t endinpos;
3405 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 PyUnicodeObject *v;
3407 Py_UNICODE *p;
3408 const char *end;
3409 const char *reason;
3410 PyObject *errorHandler = NULL;
3411 PyObject *exc = NULL;
3412
Neal Norwitzd43069c2006-01-08 01:12:10 +00003413#ifdef Py_UNICODE_WIDE
3414 Py_UNICODE unimax = PyUnicode_GetMax();
3415#endif
3416
Armin Rigo7ccbca92006-10-04 12:17:45 +00003417 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003418 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3419 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003420 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003421 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003422 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423 p = PyUnicode_AS_UNICODE(v);
3424 end = s + size;
3425
3426 while (s < end) {
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003427 if (end-s < Py_UNICODE_SIZE) {
3428 endinpos = end-starts;
3429 reason = "truncated input";
3430 goto error;
3431 }
Neal Norwitz1004a532006-05-15 07:17:23 +00003432 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003433#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003434 /* We have to sanity check the raw data, otherwise doom looms for
3435 some malformed UCS-4 data. */
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003436 if (*p > unimax || *p < 0) {
3437 endinpos = s - starts + Py_UNICODE_SIZE;
3438 reason = "illegal code point (> 0x10FFFF)";
3439 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003440 }
Serhiy Storchakad5327d92013-02-07 16:23:11 +02003441#endif
3442 p++;
3443 s += Py_UNICODE_SIZE;
3444 continue;
3445
3446 error:
3447 startinpos = s - starts;
3448 outpos = p - PyUnicode_AS_UNICODE(v);
3449 if (unicode_decode_call_errorhandler(
3450 errors, &errorHandler,
3451 "unicode_internal", reason,
3452 starts, size, &startinpos, &endinpos, &exc, &s,
3453 &v, &outpos, &p)) {
3454 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003455 }
3456 }
3457
Martin v. Löwis412fb672006-04-13 06:34:32 +00003458 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003459 goto onError;
3460 Py_XDECREF(errorHandler);
3461 Py_XDECREF(exc);
3462 return (PyObject *)v;
3463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003465 Py_XDECREF(v);
3466 Py_XDECREF(errorHandler);
3467 Py_XDECREF(exc);
3468 return NULL;
3469}
3470
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471/* --- Latin-1 Codec ------------------------------------------------------ */
3472
3473PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 Py_ssize_t size,
3475 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476{
3477 PyUnicodeObject *v;
3478 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003479
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003481 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003482 Py_UNICODE r = *(unsigned char*)s;
3483 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003484 }
3485
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 v = _PyUnicode_New(size);
3487 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003490 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 p = PyUnicode_AS_UNICODE(v);
3492 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003493 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003495
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 Py_XDECREF(v);
3498 return NULL;
3499}
3500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501/* create or adjust a UnicodeEncodeError */
3502static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003503 const char *encoding,
3504 const Py_UNICODE *unicode, Py_ssize_t size,
3505 Py_ssize_t startpos, Py_ssize_t endpos,
3506 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003509 *exceptionObject = PyUnicodeEncodeError_Create(
3510 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 }
3512 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3514 goto onError;
3515 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3516 goto onError;
3517 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3518 goto onError;
3519 return;
3520 onError:
3521 Py_DECREF(*exceptionObject);
3522 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 }
3524}
3525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526/* raises a UnicodeEncodeError */
3527static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 const char *encoding,
3529 const Py_UNICODE *unicode, Py_ssize_t size,
3530 Py_ssize_t startpos, Py_ssize_t endpos,
3531 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532{
3533 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537}
3538
3539/* error handling callback helper:
3540 build arguments, call the callback and check the arguments,
3541 put the result into newpos and return the replacement string, which
3542 has to be freed by the caller */
3543static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 PyObject **errorHandler,
3545 const char *encoding, const char *reason,
3546 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3547 Py_ssize_t startpos, Py_ssize_t endpos,
3548 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551
3552 PyObject *restuple;
3553 PyObject *resunicode;
3554
3555 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003556 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 }
3560
3561 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003562 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565
3566 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003567 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003569 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003571 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003572 Py_DECREF(restuple);
3573 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 }
3575 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003576 &resunicode, newpos)) {
3577 Py_DECREF(restuple);
3578 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579 }
3580 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003581 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003582 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003583 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3584 Py_DECREF(restuple);
3585 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 Py_INCREF(resunicode);
3588 Py_DECREF(restuple);
3589 return resunicode;
3590}
3591
3592static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003593 Py_ssize_t size,
3594 const char *errors,
3595 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596{
3597 /* output object */
3598 PyObject *res;
3599 /* pointers to the beginning and end+1 of input */
3600 const Py_UNICODE *startp = p;
3601 const Py_UNICODE *endp = p + size;
3602 /* pointer to the beginning of the unencodable characters */
3603 /* const Py_UNICODE *badp = NULL; */
3604 /* pointer into the output */
3605 char *str;
3606 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003607 Py_ssize_t respos = 0;
3608 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003609 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3610 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 PyObject *errorHandler = NULL;
3612 PyObject *exc = NULL;
3613 /* the following variable is used for caching string comparisons
3614 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3615 int known_errorHandler = -1;
3616
3617 /* allocate enough for a simple encoding without
3618 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003619 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 if (res == NULL)
3621 goto onError;
3622 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003623 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003624 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 ressize = size;
3626
3627 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003628 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003630 /* can we encode this? */
3631 if (c<limit) {
3632 /* no overflow check, because we know that the space is enough */
3633 *str++ = (char)c;
3634 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003635 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003636 else {
3637 Py_ssize_t unicodepos = p-startp;
3638 Py_ssize_t requiredsize;
3639 PyObject *repunicode;
3640 Py_ssize_t repsize;
3641 Py_ssize_t newpos;
3642 Py_ssize_t respos;
3643 Py_UNICODE *uni2;
3644 /* startpos for collecting unencodable chars */
3645 const Py_UNICODE *collstart = p;
3646 const Py_UNICODE *collend = p;
3647 /* find all unecodable characters */
3648 while ((collend < endp) && ((*collend)>=limit))
3649 ++collend;
3650 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3651 if (known_errorHandler==-1) {
3652 if ((errors==NULL) || (!strcmp(errors, "strict")))
3653 known_errorHandler = 1;
3654 else if (!strcmp(errors, "replace"))
3655 known_errorHandler = 2;
3656 else if (!strcmp(errors, "ignore"))
3657 known_errorHandler = 3;
3658 else if (!strcmp(errors, "xmlcharrefreplace"))
3659 known_errorHandler = 4;
3660 else
3661 known_errorHandler = 0;
3662 }
3663 switch (known_errorHandler) {
3664 case 1: /* strict */
3665 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3666 goto onError;
3667 case 2: /* replace */
3668 while (collstart++<collend)
3669 *str++ = '?'; /* fall through */
3670 case 3: /* ignore */
3671 p = collend;
3672 break;
3673 case 4: /* xmlcharrefreplace */
3674 respos = str-PyString_AS_STRING(res);
3675 /* determine replacement size (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003676 for (p = collstart, repsize = 0; p < collend;) {
3677 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3678 if (ch < 10)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003679 repsize += 2+1+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003680 else if (ch < 100)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003681 repsize += 2+2+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003682 else if (ch < 1000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003683 repsize += 2+3+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003684 else if (ch < 10000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003685 repsize += 2+4+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003686 else if (ch < 100000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003687 repsize += 2+5+1;
Serhiy Storchakae822b032013-08-06 16:56:26 +03003688 else if (ch < 1000000)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003689 repsize += 2+6+1;
3690 else
3691 repsize += 2+7+1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003692 }
3693 requiredsize = respos+repsize+(endp-collend);
3694 if (requiredsize > ressize) {
3695 if (requiredsize<2*ressize)
3696 requiredsize = 2*ressize;
3697 if (_PyString_Resize(&res, requiredsize))
3698 goto onError;
3699 str = PyString_AS_STRING(res) + respos;
3700 ressize = requiredsize;
3701 }
3702 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03003703 for (p = collstart; p < collend;) {
3704 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3705 str += sprintf(str, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003706 }
3707 p = collend;
3708 break;
3709 default:
3710 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3711 encoding, reason, startp, size, &exc,
3712 collstart-startp, collend-startp, &newpos);
3713 if (repunicode == NULL)
3714 goto onError;
3715 /* need more space? (at least enough for what we have+the
3716 replacement+the rest of the string, so we won't have to
3717 check space for encodable characters) */
3718 respos = str-PyString_AS_STRING(res);
3719 repsize = PyUnicode_GET_SIZE(repunicode);
3720 requiredsize = respos+repsize+(endp-collend);
3721 if (requiredsize > ressize) {
3722 if (requiredsize<2*ressize)
3723 requiredsize = 2*ressize;
3724 if (_PyString_Resize(&res, requiredsize)) {
3725 Py_DECREF(repunicode);
3726 goto onError;
3727 }
3728 str = PyString_AS_STRING(res) + respos;
3729 ressize = requiredsize;
3730 }
3731 /* check if there is anything unencodable in the replacement
3732 and copy it to the output */
3733 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3734 c = *uni2;
3735 if (c >= limit) {
3736 raise_encode_exception(&exc, encoding, startp, size,
3737 unicodepos, unicodepos+1, reason);
3738 Py_DECREF(repunicode);
3739 goto onError;
3740 }
3741 *str = (char)c;
3742 }
3743 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003744 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003745 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003746 }
3747 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003749 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 /* If this falls res will be NULL */
3752 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 Py_XDECREF(errorHandler);
3754 Py_XDECREF(exc);
3755 return res;
3756
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003757 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 Py_XDECREF(res);
3759 Py_XDECREF(errorHandler);
3760 Py_XDECREF(exc);
3761 return NULL;
3762}
3763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003765 Py_ssize_t size,
3766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769}
3770
3771PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3772{
3773 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003774 PyErr_BadArgument();
3775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 }
3777 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003778 PyUnicode_GET_SIZE(unicode),
3779 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780}
3781
3782/* --- 7-bit ASCII Codec -------------------------------------------------- */
3783
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 Py_ssize_t size,
3786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 PyUnicodeObject *v;
3790 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003791 Py_ssize_t startinpos;
3792 Py_ssize_t endinpos;
3793 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 const char *e;
3795 PyObject *errorHandler = NULL;
3796 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003797
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003799 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003800 Py_UNICODE r = *(unsigned char*)s;
3801 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003802 }
Tim Petersced69f82003-09-16 20:30:58 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 v = _PyUnicode_New(size);
3805 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003808 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 e = s + size;
3811 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003812 register unsigned char c = (unsigned char)*s;
3813 if (c < 128) {
3814 *p++ = c;
3815 ++s;
3816 }
3817 else {
3818 startinpos = s-starts;
3819 endinpos = startinpos + 1;
3820 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3821 if (unicode_decode_call_errorhandler(
3822 errors, &errorHandler,
3823 "ascii", "ordinal not in range(128)",
3824 starts, size, &startinpos, &endinpos, &exc, &s,
3825 &v, &outpos, &p))
3826 goto onError;
3827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003829 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3831 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_XDECREF(errorHandler);
3833 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003835
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003836 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 Py_XDECREF(errorHandler);
3839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 return NULL;
3841}
3842
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003844 Py_ssize_t size,
3845 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848}
3849
3850PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3851{
3852 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003853 PyErr_BadArgument();
3854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 }
3856 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003857 PyUnicode_GET_SIZE(unicode),
3858 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859}
3860
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003861#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003862
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003863/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003864
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003865#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866#define NEED_RETRY
3867#endif
3868
3869/* XXX This code is limited to "true" double-byte encodings, as
3870 a) it assumes an incomplete character consists of a single byte, and
3871 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003872 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003873
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone is not sufficient, so the character preceding
   s+offset is located with CharPrev() and the byte is accepted when
   CharPrev() did not move, when the preceding character does not start
   with a lead byte, or when the preceding character is two bytes
   long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3884
3885/*
3886 * Decode MBCS string into unicode object. If 'final' is set, converts
3887 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3888 */
3889static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003890 const char *s, /* MBCS string */
3891 int size, /* sizeof MBCS string */
3892 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893{
3894 Py_UNICODE *p;
3895 Py_ssize_t n = 0;
3896 int usize = 0;
3897
3898 assert(size >= 0);
3899
3900 /* Skip trailing lead-byte unless 'final' is set */
3901 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903
3904 /* First get the size of the result */
3905 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003906 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3907 if (usize == 0) {
3908 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3909 return -1;
3910 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003911 }
3912
3913 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003914 /* Create unicode object */
3915 *v = _PyUnicode_New(usize);
3916 if (*v == NULL)
3917 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003918 }
3919 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 /* Extend unicode object */
3921 n = PyUnicode_GET_SIZE(*v);
3922 if (_PyUnicode_Resize(v, n + usize) < 0)
3923 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924 }
3925
3926 /* Do the conversion */
3927 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 p = PyUnicode_AS_UNICODE(*v) + n;
3929 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3930 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3931 return -1;
3932 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933 }
3934
3935 return size;
3936}
3937
3938PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 Py_ssize_t size,
3940 const char *errors,
3941 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003942{
3943 PyUnicodeObject *v = NULL;
3944 int done;
3945
3946 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003947 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003948
3949#ifdef NEED_RETRY
3950 retry:
3951 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003952 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003953 else
3954#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003955 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003956
3957 if (done < 0) {
3958 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003959 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003960 }
3961
3962 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003963 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964
3965#ifdef NEED_RETRY
3966 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003967 s += done;
3968 size -= done;
3969 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003970 }
3971#endif
3972
3973 return (PyObject *)v;
3974}
3975
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003977 Py_ssize_t size,
3978 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003979{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003980 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3981}
3982
3983/*
3984 * Convert unicode into string object (MBCS).
3985 * Returns 0 if succeed, -1 otherwise.
3986 */
3987static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003988 const Py_UNICODE *p, /* unicode */
3989 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990{
3991 int mbcssize = 0;
3992 Py_ssize_t n = 0;
3993
3994 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003995
3996 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003998 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3999 if (mbcssize == 0) {
4000 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4001 return -1;
4002 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004003 }
4004
Martin v. Löwisd8251432006-06-14 05:21:04 +00004005 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004006 /* Create string object */
4007 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4008 if (*repr == NULL)
4009 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 }
4011 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 /* Extend string object */
4013 n = PyString_Size(*repr);
4014 if (_PyString_Resize(repr, n + mbcssize) < 0)
4015 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 }
4017
4018 /* Do the conversion */
4019 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 char *s = PyString_AS_STRING(*repr) + n;
4021 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4022 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4023 return -1;
4024 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004025 }
4026
4027 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004028}
4029
4030PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004031 Py_ssize_t size,
4032 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004033{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004034 PyObject *repr = NULL;
4035 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004036
Martin v. Löwisd8251432006-06-14 05:21:04 +00004037#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004038 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004039 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004040 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004041 else
4042#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004043 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004044
Martin v. Löwisd8251432006-06-14 05:21:04 +00004045 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004046 Py_XDECREF(repr);
4047 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004048 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004049
4050#ifdef NEED_RETRY
4051 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004052 p += INT_MAX;
4053 size -= INT_MAX;
4054 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004055 }
4056#endif
4057
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004058 return repr;
4059}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004060
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004061PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4062{
4063 if (!PyUnicode_Check(unicode)) {
4064 PyErr_BadArgument();
4065 return NULL;
4066 }
4067 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004068 PyUnicode_GET_SIZE(unicode),
4069 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004070}
4071
Martin v. Löwisd8251432006-06-14 05:21:04 +00004072#undef NEED_RETRY
4073
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004074#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004075
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076/* --- Character Mapping Codec -------------------------------------------- */
4077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 Py_ssize_t size,
4080 PyObject *mapping,
4081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004084 Py_ssize_t startinpos;
4085 Py_ssize_t endinpos;
4086 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 PyUnicodeObject *v;
4089 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004090 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 PyObject *errorHandler = NULL;
4092 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004093 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004094 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004095
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 /* Default to Latin-1 */
4097 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099
4100 v = _PyUnicode_New(size);
4101 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004104 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004107 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004108 mapstring = PyUnicode_AS_UNICODE(mapping);
4109 maplen = PyUnicode_GET_SIZE(mapping);
4110 while (s < e) {
4111 unsigned char ch = *s;
4112 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004114 if (ch < maplen)
4115 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 if (x == 0xfffe) {
4118 /* undefined mapping */
4119 outpos = p-PyUnicode_AS_UNICODE(v);
4120 startinpos = s-starts;
4121 endinpos = startinpos+1;
4122 if (unicode_decode_call_errorhandler(
4123 errors, &errorHandler,
4124 "charmap", "character maps to <undefined>",
4125 starts, size, &startinpos, &endinpos, &exc, &s,
4126 &v, &outpos, &p)) {
4127 goto onError;
4128 }
4129 continue;
4130 }
4131 *p++ = x;
4132 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004133 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004134 }
4135 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004136 while (s < e) {
4137 unsigned char ch = *s;
4138 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004139
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004140 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4141 w = PyInt_FromLong((long)ch);
4142 if (w == NULL)
4143 goto onError;
4144 x = PyObject_GetItem(mapping, w);
4145 Py_DECREF(w);
4146 if (x == NULL) {
4147 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4148 /* No mapping found means: mapping is undefined. */
4149 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004150 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004151 } else
4152 goto onError;
4153 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004154
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004155 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004156 if (x == Py_None)
4157 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004158 if (PyInt_Check(x)) {
4159 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004160 if (value == 0xFFFE)
4161 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004162 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004163 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004164 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004165 Py_DECREF(x);
4166 goto onError;
4167 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004168
4169#ifndef Py_UNICODE_WIDE
4170 if (value > 0xFFFF) {
4171 /* see the code for 1-n mapping below */
4172 if (extrachars < 2) {
4173 /* resize first */
4174 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4175 Py_ssize_t needed = 10 - extrachars;
4176 extrachars += needed;
4177 /* XXX overflow detection missing */
4178 if (_PyUnicode_Resize(&v,
4179 PyUnicode_GET_SIZE(v) + needed) < 0) {
4180 Py_DECREF(x);
4181 goto onError;
4182 }
4183 p = PyUnicode_AS_UNICODE(v) + oldpos;
4184 }
4185 value -= 0x10000;
4186 *p++ = 0xD800 | (value >> 10);
4187 *p++ = 0xDC00 | (value & 0x3FF);
4188 extrachars -= 2;
4189 }
4190 else
4191#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004192 *p++ = (Py_UNICODE)value;
4193 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004194 else if (PyUnicode_Check(x)) {
4195 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004196
Serhiy Storchaka95997452013-01-15 14:42:59 +02004197 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004198 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004199 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4200 if (value == 0xFFFE)
4201 goto Undefined;
4202 *p++ = value;
4203 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004204 else if (targetsize > 1) {
4205 /* 1-n mapping */
4206 if (targetsize > extrachars) {
4207 /* resize first */
4208 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4209 Py_ssize_t needed = (targetsize - extrachars) + \
4210 (targetsize << 2);
4211 extrachars += needed;
4212 /* XXX overflow detection missing */
4213 if (_PyUnicode_Resize(&v,
4214 PyUnicode_GET_SIZE(v) + needed) < 0) {
4215 Py_DECREF(x);
4216 goto onError;
4217 }
4218 p = PyUnicode_AS_UNICODE(v) + oldpos;
4219 }
4220 Py_UNICODE_COPY(p,
4221 PyUnicode_AS_UNICODE(x),
4222 targetsize);
4223 p += targetsize;
4224 extrachars -= targetsize;
4225 }
4226 /* 1-0 mapping: skip the character */
4227 }
4228 else {
4229 /* wrong return value */
4230 PyErr_SetString(PyExc_TypeError,
4231 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004232 Py_DECREF(x);
4233 goto onError;
4234 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 Py_DECREF(x);
4236 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004237 continue;
4238Undefined:
4239 /* undefined mapping */
4240 Py_XDECREF(x);
4241 outpos = p-PyUnicode_AS_UNICODE(v);
4242 startinpos = s-starts;
4243 endinpos = startinpos+1;
4244 if (unicode_decode_call_errorhandler(
4245 errors, &errorHandler,
4246 "charmap", "character maps to <undefined>",
4247 starts, size, &startinpos, &endinpos, &exc, &s,
4248 &v, &outpos, &p)) {
4249 goto onError;
4250 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252 }
4253 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 Py_XDECREF(errorHandler);
4257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004260 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 Py_XDECREF(errorHandler);
4262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 Py_XDECREF(v);
4264 return NULL;
4265}
4266
Martin v. Löwis3f767792006-06-04 19:36:28 +00004267/* Charmap encoding: the lookup table */
4268
4269struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004270 PyObject_HEAD
4271 unsigned char level1[32];
4272 int count2, count3;
4273 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004274};
4275
4276static PyObject*
4277encoding_map_size(PyObject *obj, PyObject* args)
4278{
4279 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004280 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004281 128*map->count3);
4282}
4283
4284static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004285 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004286 PyDoc_STR("Return the size (in bytes) of this object") },
4287 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004288};
4289
4290static void
4291encoding_map_dealloc(PyObject* o)
4292{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004293 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004294}
4295
4296static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004297 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004298 "EncodingMap", /*tp_name*/
4299 sizeof(struct encoding_map), /*tp_basicsize*/
4300 0, /*tp_itemsize*/
4301 /* methods */
4302 encoding_map_dealloc, /*tp_dealloc*/
4303 0, /*tp_print*/
4304 0, /*tp_getattr*/
4305 0, /*tp_setattr*/
4306 0, /*tp_compare*/
4307 0, /*tp_repr*/
4308 0, /*tp_as_number*/
4309 0, /*tp_as_sequence*/
4310 0, /*tp_as_mapping*/
4311 0, /*tp_hash*/
4312 0, /*tp_call*/
4313 0, /*tp_str*/
4314 0, /*tp_getattro*/
4315 0, /*tp_setattro*/
4316 0, /*tp_as_buffer*/
4317 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4318 0, /*tp_doc*/
4319 0, /*tp_traverse*/
4320 0, /*tp_clear*/
4321 0, /*tp_richcompare*/
4322 0, /*tp_weaklistoffset*/
4323 0, /*tp_iter*/
4324 0, /*tp_iternext*/
4325 encoding_map_methods, /*tp_methods*/
4326 0, /*tp_members*/
4327 0, /*tp_getset*/
4328 0, /*tp_base*/
4329 0, /*tp_dict*/
4330 0, /*tp_descr_get*/
4331 0, /*tp_descr_set*/
4332 0, /*tp_dictoffset*/
4333 0, /*tp_init*/
4334 0, /*tp_alloc*/
4335 0, /*tp_new*/
4336 0, /*tp_free*/
4337 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004338};
4339
4340PyObject*
4341PyUnicode_BuildEncodingMap(PyObject* string)
4342{
4343 Py_UNICODE *decode;
4344 PyObject *result;
4345 struct encoding_map *mresult;
4346 int i;
4347 int need_dict = 0;
4348 unsigned char level1[32];
4349 unsigned char level2[512];
4350 unsigned char *mlevel1, *mlevel2, *mlevel3;
4351 int count2 = 0, count3 = 0;
4352
4353 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4354 PyErr_BadArgument();
4355 return NULL;
4356 }
4357 decode = PyUnicode_AS_UNICODE(string);
4358 memset(level1, 0xFF, sizeof level1);
4359 memset(level2, 0xFF, sizeof level2);
4360
4361 /* If there isn't a one-to-one mapping of NULL to \0,
4362 or if there are non-BMP characters, we need to use
4363 a mapping dictionary. */
4364 if (decode[0] != 0)
4365 need_dict = 1;
4366 for (i = 1; i < 256; i++) {
4367 int l1, l2;
4368 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004369#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004370 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004371#endif
4372 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373 need_dict = 1;
4374 break;
4375 }
4376 if (decode[i] == 0xFFFE)
4377 /* unmapped character */
4378 continue;
4379 l1 = decode[i] >> 11;
4380 l2 = decode[i] >> 7;
4381 if (level1[l1] == 0xFF)
4382 level1[l1] = count2++;
4383 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004384 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 }
4386
4387 if (count2 >= 0xFF || count3 >= 0xFF)
4388 need_dict = 1;
4389
4390 if (need_dict) {
4391 PyObject *result = PyDict_New();
4392 PyObject *key, *value;
4393 if (!result)
4394 return NULL;
4395 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004396 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004397 key = PyInt_FromLong(decode[i]);
4398 value = PyInt_FromLong(i);
4399 if (!key || !value)
4400 goto failed1;
4401 if (PyDict_SetItem(result, key, value) == -1)
4402 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004403 Py_DECREF(key);
4404 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 }
4406 return result;
4407 failed1:
4408 Py_XDECREF(key);
4409 Py_XDECREF(value);
4410 Py_DECREF(result);
4411 return NULL;
4412 }
4413
4414 /* Create a three-level trie */
4415 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4416 16*count2 + 128*count3 - 1);
4417 if (!result)
4418 return PyErr_NoMemory();
4419 PyObject_Init(result, &EncodingMapType);
4420 mresult = (struct encoding_map*)result;
4421 mresult->count2 = count2;
4422 mresult->count3 = count3;
4423 mlevel1 = mresult->level1;
4424 mlevel2 = mresult->level23;
4425 mlevel3 = mresult->level23 + 16*count2;
4426 memcpy(mlevel1, level1, 32);
4427 memset(mlevel2, 0xFF, 16*count2);
4428 memset(mlevel3, 0, 128*count3);
4429 count3 = 0;
4430 for (i = 1; i < 256; i++) {
4431 int o1, o2, o3, i2, i3;
4432 if (decode[i] == 0xFFFE)
4433 /* unmapped character */
4434 continue;
4435 o1 = decode[i]>>11;
4436 o2 = (decode[i]>>7) & 0xF;
4437 i2 = 16*mlevel1[o1] + o2;
4438 if (mlevel2[i2] == 0xFF)
4439 mlevel2[i2] = count3++;
4440 o3 = decode[i] & 0x7F;
4441 i3 = 128*mlevel2[i2] + o3;
4442 mlevel3[i3] = i;
4443 }
4444 return result;
4445}
4446
4447static int
4448encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4449{
4450 struct encoding_map *map = (struct encoding_map*)mapping;
4451 int l1 = c>>11;
4452 int l2 = (c>>7) & 0xF;
4453 int l3 = c & 0x7F;
4454 int i;
4455
4456#ifdef Py_UNICODE_WIDE
4457 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004458 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004459 }
4460#endif
4461 if (c == 0)
4462 return 0;
4463 /* level 1*/
4464 i = map->level1[l1];
4465 if (i == 0xFF) {
4466 return -1;
4467 }
4468 /* level 2*/
4469 i = map->level23[16*i+l2];
4470 if (i == 0xFF) {
4471 return -1;
4472 }
4473 /* level 3 */
4474 i = map->level23[16*map->count2 + 128*i + l3];
4475 if (i == 0) {
4476 return -1;
4477 }
4478 return i;
4479}
4480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481/* Lookup the character ch in the mapping. If the character
4482 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004483 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 PyObject *w = PyInt_FromLong((long)c);
4487 PyObject *x;
4488
4489 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004490 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 x = PyObject_GetItem(mapping, w);
4492 Py_DECREF(w);
4493 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004494 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4495 /* No mapping found means: mapping is undefined. */
4496 PyErr_Clear();
4497 x = Py_None;
4498 Py_INCREF(x);
4499 return x;
4500 } else
4501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004503 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004504 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004506 long value = PyInt_AS_LONG(x);
4507 if (value < 0 || value > 255) {
4508 PyErr_SetString(PyExc_TypeError,
4509 "character mapping must be in range(256)");
4510 Py_DECREF(x);
4511 return NULL;
4512 }
4513 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004515 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004516 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 /* wrong return value */
4519 PyErr_SetString(PyExc_TypeError,
4520 "character mapping must return integer, None or str");
4521 Py_DECREF(x);
4522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 }
4524}
4525
Martin v. Löwis3f767792006-06-04 19:36:28 +00004526static int
4527charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4528{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004529 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4530 /* exponentially overallocate to minimize reallocations */
4531 if (requiredsize < 2*outsize)
4532 requiredsize = 2*outsize;
4533 if (_PyString_Resize(outobj, requiredsize)) {
4534 return 0;
4535 }
4536 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004537}
4538
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* output for the character was written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542/* lookup the character, put the result in the output string and adjust
4543 various state variables. Reallocate the output string if not enough
4544 space is available. Return a new reference to the object that
4545 was put in the output buffer, or Py_None, if the mapping was undefined
4546 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004547 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004549charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004550 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004552 PyObject *rep;
4553 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004554 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555
Christian Heimese93237d2007-12-19 02:37:44 +00004556 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004557 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004558 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004559 if (res == -1)
4560 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004561 if (outsize<requiredsize)
4562 if (!charmapencode_resize(outobj, outpos, requiredsize))
4563 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004564 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004565 outstart[(*outpos)++] = (char)res;
4566 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004567 }
4568
4569 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004571 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004573 Py_DECREF(rep);
4574 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004575 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004576 if (PyInt_Check(rep)) {
4577 Py_ssize_t requiredsize = *outpos+1;
4578 if (outsize<requiredsize)
4579 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4580 Py_DECREF(rep);
4581 return enc_EXCEPTION;
4582 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004583 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004584 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004585 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004586 else {
4587 const char *repchars = PyString_AS_STRING(rep);
4588 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4589 Py_ssize_t requiredsize = *outpos+repsize;
4590 if (outsize<requiredsize)
4591 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4592 Py_DECREF(rep);
4593 return enc_EXCEPTION;
4594 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004595 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004596 memcpy(outstart + *outpos, repchars, repsize);
4597 *outpos += repsize;
4598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 }
Georg Brandl9f167602006-06-04 21:46:16 +00004600 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004601 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602}
4603
4604/* handle an error in PyUnicode_EncodeCharmap
4605 Return 0 on success, -1 on error */
4606static
4607int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004608 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004610 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004611 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612{
4613 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004614 Py_ssize_t repsize;
4615 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 Py_UNICODE *uni2;
4617 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004618 Py_ssize_t collstartpos = *inpos;
4619 Py_ssize_t collendpos = *inpos+1;
4620 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 char *encoding = "charmap";
4622 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004623 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 /* find all unencodable characters */
4626 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004627 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004628 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004629 int res = encoding_map_lookup(p[collendpos], mapping);
4630 if (res != -1)
4631 break;
4632 ++collendpos;
4633 continue;
4634 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004635
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004636 rep = charmapencode_lookup(p[collendpos], mapping);
4637 if (rep==NULL)
4638 return -1;
4639 else if (rep!=Py_None) {
4640 Py_DECREF(rep);
4641 break;
4642 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004643 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004644 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 }
4646 /* cache callback name lookup
4647 * (if not done yet, i.e. it's the first error) */
4648 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004649 if ((errors==NULL) || (!strcmp(errors, "strict")))
4650 *known_errorHandler = 1;
4651 else if (!strcmp(errors, "replace"))
4652 *known_errorHandler = 2;
4653 else if (!strcmp(errors, "ignore"))
4654 *known_errorHandler = 3;
4655 else if (!strcmp(errors, "xmlcharrefreplace"))
4656 *known_errorHandler = 4;
4657 else
4658 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
4660 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004661 case 1: /* strict */
4662 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4663 return -1;
4664 case 2: /* replace */
4665 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004666 x = charmapencode_output('?', mapping, res, respos);
4667 if (x==enc_EXCEPTION) {
4668 return -1;
4669 }
4670 else if (x==enc_FAILED) {
4671 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4672 return -1;
4673 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004674 }
4675 /* fall through */
4676 case 3: /* ignore */
4677 *inpos = collendpos;
4678 break;
4679 case 4: /* xmlcharrefreplace */
Serhiy Storchakae822b032013-08-06 16:56:26 +03004680 /* generate replacement */
4681 for (collpos = collstartpos; collpos < collendpos;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004682 char buffer[2+29+1+1];
4683 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03004684 Py_UCS4 ch = p[collpos++];
4685#ifndef Py_UNICODE_WIDE
4686 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4687 (collpos < collendpos) &&
4688 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4689 ch = ((((ch & 0x03FF) << 10) |
4690 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4691 }
4692#endif
4693 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004694 for (cp = buffer; *cp; ++cp) {
4695 x = charmapencode_output(*cp, mapping, res, respos);
4696 if (x==enc_EXCEPTION)
4697 return -1;
4698 else if (x==enc_FAILED) {
4699 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4700 return -1;
4701 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004702 }
4703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004704 *inpos = collendpos;
4705 break;
4706 default:
4707 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004708 encoding, reason, p, size, exceptionObject,
4709 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004710 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004712 /* generate replacement */
4713 repsize = PyUnicode_GET_SIZE(repunicode);
4714 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004715 x = charmapencode_output(*uni2, mapping, res, respos);
4716 if (x==enc_EXCEPTION) {
4717 return -1;
4718 }
4719 else if (x==enc_FAILED) {
4720 Py_DECREF(repunicode);
4721 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4722 return -1;
4723 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004724 }
4725 *inpos = newpos;
4726 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 }
4728 return 0;
4729}
4730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004732 Py_ssize_t size,
4733 PyObject *mapping,
4734 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 /* output object */
4737 PyObject *res = NULL;
4738 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004739 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004741 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 PyObject *errorHandler = NULL;
4743 PyObject *exc = NULL;
4744 /* the following variable is used for caching string comparisons
4745 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4746 * 3=ignore, 4=xmlcharrefreplace */
4747 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
4749 /* Default to Latin-1 */
4750 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004751 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 /* allocate enough for a simple encoding without
4754 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004755 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 if (res == NULL)
4757 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004758 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004759 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 /* try to encode it */
4763 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4764 if (x==enc_EXCEPTION) /* error */
4765 goto onError;
4766 if (x==enc_FAILED) { /* unencodable character */
4767 if (charmap_encoding_error(p, size, &inpos, mapping,
4768 &exc,
4769 &known_errorHandler, &errorHandler, errors,
4770 &res, &respos)) {
4771 goto onError;
4772 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004773 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004774 else
4775 /* done with this character => adjust input position */
4776 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004780 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 if (_PyString_Resize(&res, respos))
4782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783 }
4784 Py_XDECREF(exc);
4785 Py_XDECREF(errorHandler);
4786 return res;
4787
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 Py_XDECREF(res);
4790 Py_XDECREF(exc);
4791 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 return NULL;
4793}
4794
4795PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004796 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
4798 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 PyErr_BadArgument();
4800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 }
4802 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004803 PyUnicode_GET_SIZE(unicode),
4804 mapping,
4805 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806}
4807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808/* create or adjust a UnicodeTranslateError */
4809static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 const Py_UNICODE *unicode, Py_ssize_t size,
4811 Py_ssize_t startpos, Py_ssize_t endpos,
4812 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004815 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004816 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 }
4818 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4820 goto onError;
4821 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4822 goto onError;
4823 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4824 goto onError;
4825 return;
4826 onError:
4827 Py_DECREF(*exceptionObject);
4828 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
4830}
4831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832/* raises a UnicodeTranslateError */
4833static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004834 const Py_UNICODE *unicode, Py_ssize_t size,
4835 Py_ssize_t startpos, Py_ssize_t endpos,
4836 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837{
4838 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004839 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004841 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842}
4843
4844/* error handling callback helper:
4845 build arguments, call the callback and check the arguments,
4846 put the result into newpos and return the replacement string, which
4847 has to be freed by the caller */
4848static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 PyObject **errorHandler,
4850 const char *reason,
4851 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4852 Py_ssize_t startpos, Py_ssize_t endpos,
4853 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004855 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856
Martin v. Löwis412fb672006-04-13 06:34:32 +00004857 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 PyObject *restuple;
4859 PyObject *resunicode;
4860
4861 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004862 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004864 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 }
4866
4867 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004868 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871
4872 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004873 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004875 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004877 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004878 Py_DECREF(restuple);
4879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 }
4881 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004882 &resunicode, &i_newpos)) {
4883 Py_DECREF(restuple);
4884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004886 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004887 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004888 else
4889 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004890 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4892 Py_DECREF(restuple);
4893 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004894 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 Py_INCREF(resunicode);
4896 Py_DECREF(restuple);
4897 return resunicode;
4898}
4899
4900/* Lookup the character ch in the mapping and put the result in result,
4901 which must be decrefed by the caller.
4902 Return 0 on success, -1 on error */
4903static
4904int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4905{
4906 PyObject *w = PyInt_FromLong((long)c);
4907 PyObject *x;
4908
4909 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004910 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 x = PyObject_GetItem(mapping, w);
4912 Py_DECREF(w);
4913 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4915 /* No mapping found means: use 1:1 mapping. */
4916 PyErr_Clear();
4917 *result = NULL;
4918 return 0;
4919 } else
4920 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 }
4922 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004923 *result = x;
4924 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 }
4926 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 long value = PyInt_AS_LONG(x);
4928 long max = PyUnicode_GetMax();
4929 if (value < 0 || value > max) {
4930 PyErr_Format(PyExc_TypeError,
4931 "character mapping must be in range(0x%lx)", max+1);
4932 Py_DECREF(x);
4933 return -1;
4934 }
4935 *result = x;
4936 return 0;
4937 }
4938 else if (PyUnicode_Check(x)) {
4939 *result = x;
4940 return 0;
4941 }
4942 else {
4943 /* wrong return value */
4944 PyErr_SetString(PyExc_TypeError,
4945 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004946 Py_DECREF(x);
4947 return -1;
4948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949}
4950/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 if not reallocate and adjust various state variables.
4952 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953static
Walter Dörwald4894c302003-10-24 14:25:28 +00004954int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004957 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004958 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004959 /* remember old output position */
4960 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4961 /* exponentially overallocate to minimize reallocations */
4962 if (requiredsize < 2 * oldsize)
4963 requiredsize = 2 * oldsize;
4964 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4965 return -1;
4966 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 }
4968 return 0;
4969}
4970/* lookup the character, put the result in the output string and adjust
4971 various state variables. Return a new reference to the object that
4972 was put in the output buffer in *result, or Py_None, if the mapping was
4973 undefined (in which case no character was written).
4974 The called must decref result.
4975 Return 0 on success, -1 on error. */
4976static
Walter Dörwald4894c302003-10-24 14:25:28 +00004977int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004978 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4979 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980{
Walter Dörwald4894c302003-10-24 14:25:28 +00004981 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 /* not found => default to 1:1 mapping */
4985 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 }
4987 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 /* no overflow check, because we know that the space is enough */
4991 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 }
4993 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004994 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4995 if (repsize==1) {
4996 /* no overflow check, because we know that the space is enough */
4997 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4998 }
4999 else if (repsize!=0) {
5000 /* more than one character */
5001 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5002 (insize - (curinp-startinp)) +
5003 repsize - 1;
5004 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5005 return -1;
5006 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5007 *outp += repsize;
5008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 }
5010 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 return 0;
5013}
5014
5015PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005016 Py_ssize_t size,
5017 PyObject *mapping,
5018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 /* output object */
5021 PyObject *res = NULL;
5022 /* pointers to the beginning and end+1 of input */
5023 const Py_UNICODE *startp = p;
5024 const Py_UNICODE *endp = p + size;
5025 /* pointer into the output */
5026 Py_UNICODE *str;
5027 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005028 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 char *reason = "character maps to <undefined>";
5030 PyObject *errorHandler = NULL;
5031 PyObject *exc = NULL;
5032 /* the following variable is used for caching string comparisons
5033 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5034 * 3=ignore, 4=xmlcharrefreplace */
5035 int known_errorHandler = -1;
5036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005038 PyErr_BadArgument();
5039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041
5042 /* allocate enough for a simple 1:1 translation without
5043 replacements, if we need more, we'll resize */
5044 res = PyUnicode_FromUnicode(NULL, size);
5045 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005048 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005052 /* try to encode it */
5053 PyObject *x = NULL;
5054 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5055 Py_XDECREF(x);
5056 goto onError;
5057 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005058 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005059 if (x!=Py_None) /* it worked => adjust input pointer */
5060 ++p;
5061 else { /* untranslatable character */
5062 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5063 Py_ssize_t repsize;
5064 Py_ssize_t newpos;
5065 Py_UNICODE *uni2;
5066 /* startpos for collecting untranslatable chars */
5067 const Py_UNICODE *collstart = p;
5068 const Py_UNICODE *collend = p+1;
5069 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005071 /* find all untranslatable characters */
5072 while (collend < endp) {
5073 if (charmaptranslate_lookup(*collend, mapping, &x))
5074 goto onError;
5075 Py_XDECREF(x);
5076 if (x!=Py_None)
5077 break;
5078 ++collend;
5079 }
5080 /* cache callback name lookup
5081 * (if not done yet, i.e. it's the first error) */
5082 if (known_errorHandler==-1) {
5083 if ((errors==NULL) || (!strcmp(errors, "strict")))
5084 known_errorHandler = 1;
5085 else if (!strcmp(errors, "replace"))
5086 known_errorHandler = 2;
5087 else if (!strcmp(errors, "ignore"))
5088 known_errorHandler = 3;
5089 else if (!strcmp(errors, "xmlcharrefreplace"))
5090 known_errorHandler = 4;
5091 else
5092 known_errorHandler = 0;
5093 }
5094 switch (known_errorHandler) {
5095 case 1: /* strict */
5096 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005097 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 case 2: /* replace */
5099 /* No need to check for space, this is a 1:1 replacement */
5100 for (coll = collstart; coll<collend; ++coll)
5101 *str++ = '?';
5102 /* fall through */
5103 case 3: /* ignore */
5104 p = collend;
5105 break;
5106 case 4: /* xmlcharrefreplace */
5107 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005108 for (p = collstart; p < collend;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 char buffer[2+29+1+1];
5110 char *cp;
Serhiy Storchakae822b032013-08-06 16:56:26 +03005111 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5112 sprintf(buffer, "&#%d;", (int)ch);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005113 if (charmaptranslate_makespace(&res, &str,
5114 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5115 goto onError;
5116 for (cp = buffer; *cp; ++cp)
5117 *str++ = *cp;
5118 }
5119 p = collend;
5120 break;
5121 default:
5122 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5123 reason, startp, size, &exc,
5124 collstart-startp, collend-startp, &newpos);
5125 if (repunicode == NULL)
5126 goto onError;
5127 /* generate replacement */
5128 repsize = PyUnicode_GET_SIZE(repunicode);
5129 if (charmaptranslate_makespace(&res, &str,
5130 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5131 Py_DECREF(repunicode);
5132 goto onError;
5133 }
5134 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5135 *str++ = *uni2;
5136 p = startp + newpos;
5137 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005138 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005139 }
5140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 /* Resize if we allocated too much */
5142 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005143 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005144 if (PyUnicode_Resize(&res, respos) < 0)
5145 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 }
5147 Py_XDECREF(exc);
5148 Py_XDECREF(errorHandler);
5149 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 Py_XDECREF(res);
5153 Py_XDECREF(exc);
5154 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 return NULL;
5156}
5157
5158PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005159 PyObject *mapping,
5160 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
5162 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 str = PyUnicode_FromObject(str);
5165 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005166 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005168 PyUnicode_GET_SIZE(str),
5169 mapping,
5170 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 Py_DECREF(str);
5172 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 Py_XDECREF(str);
5176 return NULL;
5177}
Tim Petersced69f82003-09-16 20:30:58 +00005178
Guido van Rossum9e896b32000-04-05 20:11:21 +00005179/* --- Decimal Encoder ---------------------------------------------------- */
5180
5181int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005182 Py_ssize_t length,
5183 char *output,
5184 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005185{
5186 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 PyObject *errorHandler = NULL;
5188 PyObject *exc = NULL;
5189 const char *encoding = "decimal";
5190 const char *reason = "invalid decimal Unicode string";
5191 /* the following variable is used for caching string comparisons
5192 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5193 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005194
5195 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005196 PyErr_BadArgument();
5197 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005198 }
5199
5200 p = s;
5201 end = s + length;
5202 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005203 register Py_UNICODE ch = *p;
5204 int decimal;
5205 PyObject *repunicode;
5206 Py_ssize_t repsize;
5207 Py_ssize_t newpos;
5208 Py_UNICODE *uni2;
5209 Py_UNICODE *collstart;
5210 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005212 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005213 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005214 ++p;
5215 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005216 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005217 decimal = Py_UNICODE_TODECIMAL(ch);
5218 if (decimal >= 0) {
5219 *output++ = '0' + decimal;
5220 ++p;
5221 continue;
5222 }
5223 if (0 < ch && ch < 256) {
5224 *output++ = (char)ch;
5225 ++p;
5226 continue;
5227 }
5228 /* All other characters are considered unencodable */
5229 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005230 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005231 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005232 Py_UNICODE_ISSPACE(*collend) ||
5233 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005234 break;
5235 }
5236 /* cache callback name lookup
5237 * (if not done yet, i.e. it's the first error) */
5238 if (known_errorHandler==-1) {
5239 if ((errors==NULL) || (!strcmp(errors, "strict")))
5240 known_errorHandler = 1;
5241 else if (!strcmp(errors, "replace"))
5242 known_errorHandler = 2;
5243 else if (!strcmp(errors, "ignore"))
5244 known_errorHandler = 3;
5245 else if (!strcmp(errors, "xmlcharrefreplace"))
5246 known_errorHandler = 4;
5247 else
5248 known_errorHandler = 0;
5249 }
5250 switch (known_errorHandler) {
5251 case 1: /* strict */
5252 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5253 goto onError;
5254 case 2: /* replace */
5255 for (p = collstart; p < collend; ++p)
5256 *output++ = '?';
5257 /* fall through */
5258 case 3: /* ignore */
5259 p = collend;
5260 break;
5261 case 4: /* xmlcharrefreplace */
5262 /* generate replacement (temporarily (mis)uses p) */
Serhiy Storchakae822b032013-08-06 16:56:26 +03005263 for (p = collstart; p < collend;) {
5264 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5265 output += sprintf(output, "&#%d;", ch);
5266 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005267 p = collend;
5268 break;
5269 default:
5270 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5271 encoding, reason, s, length, &exc,
5272 collstart-s, collend-s, &newpos);
5273 if (repunicode == NULL)
5274 goto onError;
5275 /* generate replacement */
5276 repsize = PyUnicode_GET_SIZE(repunicode);
5277 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5278 Py_UNICODE ch = *uni2;
5279 if (Py_UNICODE_ISSPACE(ch))
5280 *output++ = ' ';
5281 else {
5282 decimal = Py_UNICODE_TODECIMAL(ch);
5283 if (decimal >= 0)
5284 *output++ = '0' + decimal;
5285 else if (0 < ch && ch < 256)
5286 *output++ = (char)ch;
5287 else {
5288 Py_DECREF(repunicode);
5289 raise_encode_exception(&exc, encoding,
5290 s, length, collstart-s, collend-s, reason);
5291 goto onError;
5292 }
5293 }
5294 }
5295 p = s + newpos;
5296 Py_DECREF(repunicode);
5297 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005298 }
5299 /* 0-terminate the output string */
5300 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301 Py_XDECREF(exc);
5302 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005303 return 0;
5304
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005305 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005306 Py_XDECREF(exc);
5307 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005308 return -1;
5309}
5310
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311/* --- Helpers ------------------------------------------------------------ */
5312
Eric Smitha9f7d622008-02-17 19:46:49 +00005313#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005314#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005315
5316#include "stringlib/count.h"
5317#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005318#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005319#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005320
Fredrik Lundhc8162812006-05-26 19:33:03 +00005321/* helper macro to fixup start/end slice values */
/* ADJUST_INDICES(start, end, len): clamp a slice's bounds in place.
 *
 * An `end` greater than `len` becomes `len`; a negative `end` has `len`
 * added and is then floored at 0.  A negative `start` likewise has `len`
 * added and is floored at 0; `start` is not capped at `len`.
 *
 * `start` and `end` are assigned to, so they must be modifiable lvalues,
 * and every argument is expanded more than once, so arguments must not
 * have side effects.  The expansion is a bare if/else-if statement
 * followed by a second if statement (it is not wrapped in
 * do { } while (0)), so use it only as a complete statement, never as
 * the unbraced body of an if/else or a loop.
 */
Antoine Pitrou64672132010-01-13 07:55:48 +00005322#define ADJUST_INDICES(start, end, len) \
5323 if (end > len) \
5324 end = len; \
5325 else if (end < 0) { \
5326 end += len; \
5327 if (end < 0) \
5328 end = 0; \
5329 } \
5330 if (start < 0) { \
5331 start += len; \
5332 if (start < 0) \
5333 start = 0; \
5334 }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005335
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005337 PyObject *substr,
5338 Py_ssize_t start,
5339 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005342 PyUnicodeObject* str_obj;
5343 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005345 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5346 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005347 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005348 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5349 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005350 Py_DECREF(str_obj);
5351 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 }
Tim Petersced69f82003-09-16 20:30:58 +00005353
Antoine Pitrou64672132010-01-13 07:55:48 +00005354 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005355 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005356 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5357 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005358 );
5359
5360 Py_DECREF(sub_obj);
5361 Py_DECREF(str_obj);
5362
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 return result;
5364}
5365
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005367 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005368 Py_ssize_t start,
5369 Py_ssize_t end,
5370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005374 str = PyUnicode_FromObject(str);
5375 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005377 sub = PyUnicode_FromObject(sub);
5378 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005379 Py_DECREF(str);
5380 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Tim Petersced69f82003-09-16 20:30:58 +00005382
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005383 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005384 result = stringlib_find_slice(
5385 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5386 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5387 start, end
5388 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005389 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005390 result = stringlib_rfind_slice(
5391 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5392 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5393 start, end
5394 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005395
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005396 Py_DECREF(str);
5397 Py_DECREF(sub);
5398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 return result;
5400}
5401
Tim Petersced69f82003-09-16 20:30:58 +00005402static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005404 PyUnicodeObject *substring,
5405 Py_ssize_t start,
5406 Py_ssize_t end,
5407 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (substring->length == 0)
5410 return 1;
5411
Antoine Pitrou64672132010-01-13 07:55:48 +00005412 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 end -= substring->length;
5414 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005415 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416
5417 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005418 if (Py_UNICODE_MATCH(self, end, substring))
5419 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 } else {
5421 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005422 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 }
5424
5425 return 0;
5426}
5427
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005429 PyObject *substr,
5430 Py_ssize_t start,
5431 Py_ssize_t end,
5432 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005434 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 str = PyUnicode_FromObject(str);
5437 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005438 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 substr = PyUnicode_FromObject(substr);
5440 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005441 Py_DECREF(str);
5442 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 }
Tim Petersced69f82003-09-16 20:30:58 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 (PyUnicodeObject *)substr,
5447 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 Py_DECREF(str);
5449 Py_DECREF(substr);
5450 return result;
5451}
5452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453/* Apply fixfct filter to the Unicode object self and return a
5454 reference to the modified object */
5455
Tim Petersced69f82003-09-16 20:30:58 +00005456static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005458 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459{
5460
5461 PyUnicodeObject *u;
5462
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005463 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005465 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005466
5467 Py_UNICODE_COPY(u->str, self->str, self->length);
5468
Tim Peters7a29bd52001-09-12 03:03:31 +00005469 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005470 /* fixfct should return TRUE if it modified the buffer. If
5471 FALSE, return a reference to the original buffer instead
5472 (to save space, not time) */
5473 Py_INCREF(self);
5474 Py_DECREF(u);
5475 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 }
5477 return (PyObject*) u;
5478}
5479
Tim Petersced69f82003-09-16 20:30:58 +00005480static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481int fixupper(PyUnicodeObject *self)
5482{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005483 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 Py_UNICODE *s = self->str;
5485 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005486
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005488 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005489
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005490 ch = Py_UNICODE_TOUPPER(*s);
5491 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005493 *s = ch;
5494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 s++;
5496 }
5497
5498 return status;
5499}
5500
Tim Petersced69f82003-09-16 20:30:58 +00005501static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502int fixlower(PyUnicodeObject *self)
5503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005504 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 Py_UNICODE *s = self->str;
5506 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005507
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005509 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005510
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 ch = Py_UNICODE_TOLOWER(*s);
5512 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005514 *s = ch;
5515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 s++;
5517 }
5518
5519 return status;
5520}
5521
Tim Petersced69f82003-09-16 20:30:58 +00005522static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523int fixswapcase(PyUnicodeObject *self)
5524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 Py_UNICODE *s = self->str;
5527 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005528
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 while (len-- > 0) {
5530 if (Py_UNICODE_ISUPPER(*s)) {
5531 *s = Py_UNICODE_TOLOWER(*s);
5532 status = 1;
5533 } else if (Py_UNICODE_ISLOWER(*s)) {
5534 *s = Py_UNICODE_TOUPPER(*s);
5535 status = 1;
5536 }
5537 s++;
5538 }
5539
5540 return status;
5541}
5542
Tim Petersced69f82003-09-16 20:30:58 +00005543static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544int fixcapitalize(PyUnicodeObject *self)
5545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005546 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005547 Py_UNICODE *s = self->str;
5548 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005549
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005550 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005551 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005552 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005553 *s = Py_UNICODE_TOUPPER(*s);
5554 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005556 s++;
5557 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005558 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005559 *s = Py_UNICODE_TOLOWER(*s);
5560 status = 1;
5561 }
5562 s++;
5563 }
5564 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565}
5566
5567static
5568int fixtitle(PyUnicodeObject *self)
5569{
5570 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5571 register Py_UNICODE *e;
5572 int previous_is_cased;
5573
5574 /* Shortcut for single character strings */
5575 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005576 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5577 if (*p != ch) {
5578 *p = ch;
5579 return 1;
5580 }
5581 else
5582 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Tim Petersced69f82003-09-16 20:30:58 +00005584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 e = p + PyUnicode_GET_SIZE(self);
5586 previous_is_cased = 0;
5587 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005588 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005589
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005590 if (previous_is_cased)
5591 *p = Py_UNICODE_TOLOWER(ch);
5592 else
5593 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005594
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005595 if (Py_UNICODE_ISLOWER(ch) ||
5596 Py_UNICODE_ISUPPER(ch) ||
5597 Py_UNICODE_ISTITLE(ch))
5598 previous_is_cased = 1;
5599 else
5600 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 }
5602 return 1;
5603}
5604
Tim Peters8ce9f162004-08-27 01:49:32 +00005605PyObject *
5606PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005609 const Py_UNICODE blank = ' ';
5610 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005611 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005613 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5614 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005615 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5616 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005617 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005618 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005619 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
Tim Peters05eba1f2004-08-27 21:32:02 +00005621 fseq = PySequence_Fast(seq, "");
5622 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005623 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005624 }
5625
Tim Peters91879ab2004-08-27 22:35:44 +00005626 /* Grrrr. A codec may be invoked to convert str objects to
5627 * Unicode, and so it's possible to call back into Python code
5628 * during PyUnicode_FromObject(), and so it's possible for a sick
5629 * codec to change the size of fseq (if seq is a list). Therefore
5630 * we have to keep refetching the size -- can't assume seqlen
5631 * is invariant.
5632 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 seqlen = PySequence_Fast_GET_SIZE(fseq);
5634 /* If empty sequence, return u"". */
5635 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005636 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5637 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005638 }
5639 /* If singleton sequence with an exact Unicode, return that. */
5640 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 item = PySequence_Fast_GET_ITEM(fseq, 0);
5642 if (PyUnicode_CheckExact(item)) {
5643 Py_INCREF(item);
5644 res = (PyUnicodeObject *)item;
5645 goto Done;
5646 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005647 }
5648
Tim Peters05eba1f2004-08-27 21:32:02 +00005649 /* At least two items to join, or one that isn't exact Unicode. */
5650 if (seqlen > 1) {
5651 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005652 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005653 sep = &blank;
5654 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005655 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005656 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005657 internal_separator = PyUnicode_FromObject(separator);
5658 if (internal_separator == NULL)
5659 goto onError;
5660 sep = PyUnicode_AS_UNICODE(internal_separator);
5661 seplen = PyUnicode_GET_SIZE(internal_separator);
5662 /* In case PyUnicode_FromObject() mutated seq. */
5663 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005664 }
5665 }
5666
5667 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005668 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005669 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005670 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 res_p = PyUnicode_AS_UNICODE(res);
5672 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005673
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005675 Py_ssize_t itemlen;
5676 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005677
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005678 item = PySequence_Fast_GET_ITEM(fseq, i);
5679 /* Convert item to Unicode. */
5680 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5681 PyErr_Format(PyExc_TypeError,
5682 "sequence item %zd: expected string or Unicode,"
5683 " %.80s found",
5684 i, Py_TYPE(item)->tp_name);
5685 goto onError;
5686 }
5687 item = PyUnicode_FromObject(item);
5688 if (item == NULL)
5689 goto onError;
5690 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005691
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005692 /* In case PyUnicode_FromObject() mutated seq. */
5693 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005694
Tim Peters8ce9f162004-08-27 01:49:32 +00005695 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005696 itemlen = PyUnicode_GET_SIZE(item);
5697 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005698 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005699 goto Overflow;
5700 if (i < seqlen - 1) {
5701 new_res_used += seplen;
5702 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005703 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005704 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005705 if (new_res_used > res_alloc) {
5706 /* double allocated size until it's big enough */
5707 do {
5708 res_alloc += res_alloc;
5709 if (res_alloc <= 0)
5710 goto Overflow;
5711 } while (new_res_used > res_alloc);
5712 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5713 Py_DECREF(item);
5714 goto onError;
5715 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005716 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005717 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005718
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005719 /* Copy item, and maybe the separator. */
5720 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5721 res_p += itemlen;
5722 if (i < seqlen - 1) {
5723 Py_UNICODE_COPY(res_p, sep, seplen);
5724 res_p += seplen;
5725 }
5726 Py_DECREF(item);
5727 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005728 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005729
Tim Peters05eba1f2004-08-27 21:32:02 +00005730 /* Shrink res to match the used area; this probably can't fail,
5731 * but it's cheap to check.
5732 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005733 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005734 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005735
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005736 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005737 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005738 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return (PyObject *)res;
5740
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005741 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005742 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005743 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005744 Py_DECREF(item);
5745 /* fall through */
5746
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005747 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005748 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005749 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005750 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 return NULL;
5752}
5753
Tim Petersced69f82003-09-16 20:30:58 +00005754static
5755PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005756 Py_ssize_t left,
5757 Py_ssize_t right,
5758 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759{
5760 PyUnicodeObject *u;
5761
5762 if (left < 0)
5763 left = 0;
5764 if (right < 0)
5765 right = 0;
5766
Tim Peters7a29bd52001-09-12 03:03:31 +00005767 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 Py_INCREF(self);
5769 return self;
5770 }
5771
Neal Norwitze7d8be82008-07-31 17:17:14 +00005772 if (left > PY_SSIZE_T_MAX - self->length ||
5773 right > PY_SSIZE_T_MAX - (left + self->length)) {
5774 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5775 return NULL;
5776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 u = _PyUnicode_New(left + self->length + right);
5778 if (u) {
5779 if (left)
5780 Py_UNICODE_FILL(u->str, fill, left);
5781 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5782 if (right)
5783 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5784 }
5785
5786 return u;
5787}
5788
Antoine Pitrou64672132010-01-13 07:55:48 +00005789PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
5793 string = PyUnicode_FromObject(string);
5794 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Antoine Pitrou64672132010-01-13 07:55:48 +00005797 list = stringlib_splitlines(
5798 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5799 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 Py_DECREF(string);
5802 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803}
5804
Tim Petersced69f82003-09-16 20:30:58 +00005805static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005807 PyUnicodeObject *substring,
5808 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005811 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005814 return stringlib_split_whitespace(
5815 (PyObject*) self, self->str, self->length, maxcount
5816 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Antoine Pitrou64672132010-01-13 07:55:48 +00005818 return stringlib_split(
5819 (PyObject*) self, self->str, self->length,
5820 substring->str, substring->length,
5821 maxcount
5822 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823}
5824
Tim Petersced69f82003-09-16 20:30:58 +00005825static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005827 PyUnicodeObject *substring,
5828 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005829{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005831 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005833 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005834 return stringlib_rsplit_whitespace(
5835 (PyObject*) self, self->str, self->length, maxcount
5836 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837
Antoine Pitrou64672132010-01-13 07:55:48 +00005838 return stringlib_rsplit(
5839 (PyObject*) self, self->str, self->length,
5840 substring->str, substring->length,
5841 maxcount
5842 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005843}
5844
5845static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005847 PyUnicodeObject *str1,
5848 PyUnicodeObject *str2,
5849 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850{
5851 PyUnicodeObject *u;
5852
5853 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005854 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005855 else if (maxcount == 0 || self->length == 0)
5856 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
Fredrik Lundh347ee272006-05-24 16:35:18 +00005858 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005859 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005860 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005861 if (str1->length == 0)
5862 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005863 if (str1->length == 1) {
5864 /* replace characters */
5865 Py_UNICODE u1, u2;
5866 if (!findchar(self->str, self->length, str1->str[0]))
5867 goto nothing;
5868 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5869 if (!u)
5870 return NULL;
5871 Py_UNICODE_COPY(u->str, self->str, self->length);
5872 u1 = str1->str[0];
5873 u2 = str2->str[0];
5874 for (i = 0; i < u->length; i++)
5875 if (u->str[i] == u1) {
5876 if (--maxcount < 0)
5877 break;
5878 u->str[i] = u2;
5879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005881 i = stringlib_find(
5882 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005884 if (i < 0)
5885 goto nothing;
5886 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5887 if (!u)
5888 return NULL;
5889 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005890
5891 /* change everything in-place, starting with this one */
5892 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5893 i += str1->length;
5894
5895 while ( --maxcount > 0) {
5896 i = stringlib_find(self->str+i, self->length-i,
5897 str1->str, str1->length,
5898 i);
5899 if (i == -1)
5900 break;
5901 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5902 i += str1->length;
5903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005906
Brett Cannona7f13ee2010-05-04 01:16:51 +00005907 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005908 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 Py_UNICODE *p;
5910
5911 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005912 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5913 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914 if (n == 0)
5915 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005916 /* new_size = self->length + n * (str2->length - str1->length)); */
5917 delta = (str2->length - str1->length);
5918 if (delta == 0) {
5919 new_size = self->length;
5920 } else {
5921 product = n * (str2->length - str1->length);
5922 if ((product / (str2->length - str1->length)) != n) {
5923 PyErr_SetString(PyExc_OverflowError,
5924 "replace string is too long");
5925 return NULL;
5926 }
5927 new_size = self->length + product;
5928 if (new_size < 0) {
5929 PyErr_SetString(PyExc_OverflowError,
5930 "replace string is too long");
5931 return NULL;
5932 }
5933 }
5934 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005935 if (!u)
5936 return NULL;
5937 i = 0;
5938 p = u->str;
5939 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005940 while (n-- > 0) {
5941 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005942 j = stringlib_find(self->str+i, self->length-i,
5943 str1->str, str1->length,
5944 i);
5945 if (j == -1)
5946 break;
5947 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005948 /* copy unchanged part [i:j] */
5949 Py_UNICODE_COPY(p, self->str+i, j-i);
5950 p += j - i;
5951 }
5952 /* copy substitution string */
5953 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005954 Py_UNICODE_COPY(p, str2->str, str2->length);
5955 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005956 }
5957 i = j + str1->length;
5958 }
5959 if (i < self->length)
5960 /* copy tail [i:] */
5961 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005962 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005963 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005964 while (n > 0) {
5965 Py_UNICODE_COPY(p, str2->str, str2->length);
5966 p += str2->length;
5967 if (--n <= 0)
5968 break;
5969 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005971 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
5973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005975
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005976 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005977 /* nothing to replace; return original string (when possible) */
5978 if (PyUnicode_CheckExact(self)) {
5979 Py_INCREF(self);
5980 return (PyObject *) self;
5981 }
5982 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983}
5984
5985/* --- Unicode Object Methods --------------------------------------------- */
5986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005987PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005988 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989\n\
5990Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005991characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
5993static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005994unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 return fixup(self, fixtitle);
5997}
5998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005999PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006000 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001\n\
6002Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00006003have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
6005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006006unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 return fixup(self, fixcapitalize);
6009}
6010
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): whitespace-split `self`,
   replace every list item by its fixcapitalize'd version, then join the
   list with PyUnicode_Join(NULL, ...).  Returns NULL on any failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)previous,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
6048
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006049/* Argument converter. Coerces to a single unicode character */
6050
6051static int
6052convert_uc(PyObject *obj, void *addr)
6053{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006054 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6055 PyObject *uniobj;
6056 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006057
Benjamin Peterson857ce152009-01-31 16:29:18 +00006058 uniobj = PyUnicode_FromObject(obj);
6059 if (uniobj == NULL) {
6060 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006061 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006062 return 0;
6063 }
6064 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6065 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006066 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006067 Py_DECREF(uniobj);
6068 return 0;
6069 }
6070 unistr = PyUnicode_AS_UNICODE(uniobj);
6071 *fillcharloc = unistr[0];
6072 Py_DECREF(uniobj);
6073 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006074}
6075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006076PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006077 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006079Return S centered in a Unicode string of length width. Padding is\n\
6080done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
6082static PyObject *
6083unicode_center(PyUnicodeObject *self, PyObject *args)
6084{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006085 Py_ssize_t marg, left;
6086 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006087 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Thomas Woutersde017742006-02-16 19:34:37 +00006089 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 return NULL;
6091
Tim Peters7a29bd52001-09-12 03:03:31 +00006092 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 Py_INCREF(self);
6094 return (PyObject*) self;
6095 }
6096
6097 marg = width - self->length;
6098 left = marg / 2 + (marg & width & 1);
6099
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006100 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101}
6102
Marc-André Lemburge5034372000-08-08 08:04:29 +00006103#if 0
6104
6105/* This code should go into some future Unicode collation support
6106 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006107 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006109/* speedy UTF-16 code point order comparison */
6110/* gleaned from: */
6111/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6112
/* Adjustment added to a UTF-16 code unit, indexed by (code unit >> 11).
   Only the top five slots are non-zero: slot 27 (0xD800-0xDFFF) is moved
   up by 0x2000 and slots 28-31 (0xE000-0xFFFF) are moved down by 0x800. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */  0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */  0, 0, 0,
    /* 27       */  0x2000,
    /* 28 .. 31 */  -0x800, -0x800, -0x800, -0x800
};
6120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121static int
6122unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006124 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 Py_UNICODE *s1 = str1->str;
6127 Py_UNICODE *s2 = str2->str;
6128
6129 len1 = str1->length;
6130 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006133 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006134
6135 c1 = *s1++;
6136 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006137
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 if (c1 > (1<<11) * 26)
6139 c1 += utf16Fixup[c1>>11];
6140 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006141 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006143
6144 if (c1 != c2)
6145 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006146
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
6149
6150 return (len1 < len2) ? -1 : (len1 != len2);
6151}
6152
Marc-André Lemburge5034372000-08-08 08:04:29 +00006153#else
6154
6155static int
6156unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6157{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006158 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006159
6160 Py_UNICODE *s1 = str1->str;
6161 Py_UNICODE *s2 = str2->str;
6162
6163 len1 = str1->length;
6164 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006165
Marc-André Lemburge5034372000-08-08 08:04:29 +00006166 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006167 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006168
Fredrik Lundh45714e92001-06-26 16:39:36 +00006169 c1 = *s1++;
6170 c2 = *s2++;
6171
6172 if (c1 != c2)
6173 return (c1 < c2) ? -1 : 1;
6174
Marc-André Lemburge5034372000-08-08 08:04:29 +00006175 len1--; len2--;
6176 }
6177
6178 return (len1 < len2) ? -1 : (len1 != len2);
6179}
6180
6181#endif
6182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006184 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185{
6186 PyUnicodeObject *u = NULL, *v = NULL;
6187 int result;
6188
6189 /* Coerce the two arguments */
6190 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6191 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6194 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Thomas Wouters7e474022000-07-16 12:04:32 +00006197 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006199 Py_DECREF(u);
6200 Py_DECREF(v);
6201 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 }
6203
6204 result = unicode_compare(u, v);
6205
6206 Py_DECREF(u);
6207 Py_DECREF(v);
6208 return result;
6209
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 Py_XDECREF(u);
6212 Py_XDECREF(v);
6213 return -1;
6214}
6215
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006216PyObject *PyUnicode_RichCompare(PyObject *left,
6217 PyObject *right,
6218 int op)
6219{
6220 int result;
6221
6222 result = PyUnicode_Compare(left, right);
6223 if (result == -1 && PyErr_Occurred())
6224 goto onError;
6225
6226 /* Convert the return value to a Boolean */
6227 switch (op) {
6228 case Py_EQ:
6229 result = (result == 0);
6230 break;
6231 case Py_NE:
6232 result = (result != 0);
6233 break;
6234 case Py_LE:
6235 result = (result <= 0);
6236 break;
6237 case Py_GE:
6238 result = (result >= 0);
6239 break;
6240 case Py_LT:
6241 result = (result == -1);
6242 break;
6243 case Py_GT:
6244 result = (result == 1);
6245 break;
6246 }
6247 return PyBool_FromLong(result);
6248
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006249 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006250
6251 /* Standard case
6252
6253 Type errors mean that PyUnicode_FromObject() could not convert
6254 one of the arguments (usually the right hand side) to Unicode,
6255 ie. we can't handle the comparison request. However, it is
6256 possible that the other object knows a comparison method, which
6257 is why we return Py_NotImplemented to give the other object a
6258 chance.
6259
6260 */
6261 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6262 PyErr_Clear();
6263 Py_INCREF(Py_NotImplemented);
6264 return Py_NotImplemented;
6265 }
6266 if (op != Py_EQ && op != Py_NE)
6267 return NULL;
6268
6269 /* Equality comparison.
6270
6271 This is a special case: we silence any PyExc_UnicodeDecodeError
6272 and instead turn it into a PyErr_UnicodeWarning.
6273
6274 */
6275 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6276 return NULL;
6277 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006278 if (PyErr_Warn(PyExc_UnicodeWarning,
6279 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006280 "Unicode equal comparison "
6281 "failed to convert both arguments to Unicode - "
6282 "interpreting them as being unequal" :
6283 "Unicode unequal comparison "
6284 "failed to convert both arguments to Unicode - "
6285 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006286 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006287 return NULL;
6288 result = (op == Py_NE);
6289 return PyBool_FromLong(result);
6290}
6291
Guido van Rossum403d68b2000-03-13 15:55:09 +00006292int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006293 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006294{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006295 PyObject *str, *sub;
6296 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006297
6298 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006299 sub = PyUnicode_FromObject(element);
6300 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006301 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006302 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006303
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006304 str = PyUnicode_FromObject(container);
6305 if (!str) {
6306 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006307 return -1;
6308 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006309
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006310 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006311
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006312 Py_DECREF(str);
6313 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006314
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006315 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006316}
6317
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318/* Concat to string or Unicode object giving a new Unicode object. */
6319
6320PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006321 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322{
6323 PyUnicodeObject *u = NULL, *v = NULL, *w;
6324
6325 /* Coerce the two arguments */
6326 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6327 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006328 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6330 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006331 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333 /* Shortcuts */
6334 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006335 Py_DECREF(v);
6336 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 }
6338 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006339 Py_DECREF(u);
6340 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 }
6342
6343 /* Concat the two Unicode strings */
6344 w = _PyUnicode_New(u->length + v->length);
6345 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006346 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 Py_UNICODE_COPY(w->str, u->str, u->length);
6348 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6349
6350 Py_DECREF(u);
6351 Py_DECREF(v);
6352 return (PyObject *)w;
6353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006354 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 Py_XDECREF(u);
6356 Py_XDECREF(v);
6357 return NULL;
6358}
6359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006360PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006361 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006363Return the number of non-overlapping occurrences of substring sub in\n\
6364Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006365interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
6367static PyObject *
6368unicode_count(PyUnicodeObject *self, PyObject *args)
6369{
6370 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006371 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006372 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 PyObject *result;
6374
Jesus Cea44e81682011-04-20 16:39:15 +02006375 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6376 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006377 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006378
Antoine Pitrou64672132010-01-13 07:55:48 +00006379 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006380 result = PyInt_FromSsize_t(
6381 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006382 substring->str, substring->length,
6383 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006384 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385
6386 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 return result;
6389}
6390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006391PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006392 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006394Encodes S using the codec registered for encoding. encoding defaults\n\
6395to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006396handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6398'xmlcharrefreplace' as well as any other name registered with\n\
6399codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006402unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006404 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 char *encoding = NULL;
6406 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006407 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006408
Benjamin Peterson332d7212009-09-18 21:14:55 +00006409 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6410 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006412 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006413 if (v == NULL)
6414 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006415 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006416 PyErr_Format(PyExc_TypeError,
6417 "encoder did not return a string/unicode object "
6418 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006419 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006420 Py_DECREF(v);
6421 return NULL;
6422 }
6423 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006424
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006425 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006426 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006427}
6428
6429PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006430 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006431\n\
6432Decodes S using the codec registered for encoding. encoding defaults\n\
6433to the default encoding. errors may be given to set a different error\n\
6434handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6435a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006436as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006437able to handle UnicodeDecodeErrors.");
6438
6439static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006440unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006441{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006442 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443 char *encoding = NULL;
6444 char *errors = NULL;
6445 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006446
Benjamin Peterson332d7212009-09-18 21:14:55 +00006447 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6448 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006449 return NULL;
6450 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006451 if (v == NULL)
6452 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006453 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006454 PyErr_Format(PyExc_TypeError,
6455 "decoder did not return a string/unicode object "
6456 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006457 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006458 Py_DECREF(v);
6459 return NULL;
6460 }
6461 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006462
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006463 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465}
6466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006467PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006468 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469\n\
6470Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006471If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
6473static PyObject*
6474unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6475{
6476 Py_UNICODE *e;
6477 Py_UNICODE *p;
6478 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006479 Py_UNICODE *qe;
6480 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 PyUnicodeObject *u;
6482 int tabsize = 8;
6483
6484 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
Thomas Wouters7e474022000-07-16 12:04:32 +00006487 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006488 i = 0; /* chars up to and including most recent \n or \r */
6489 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6490 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 for (p = self->str; p < e; p++)
6492 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006493 if (tabsize > 0) {
6494 incr = tabsize - (j % tabsize); /* cannot overflow */
6495 if (j > PY_SSIZE_T_MAX - incr)
6496 goto overflow1;
6497 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006498 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006501 if (j > PY_SSIZE_T_MAX - 1)
6502 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 j++;
6504 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006505 if (i > PY_SSIZE_T_MAX - j)
6506 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006508 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
6510 }
6511
Guido van Rossum5bdff602008-03-11 21:18:06 +00006512 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006513 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006514
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 /* Second pass: create output string and fill it */
6516 u = _PyUnicode_New(i + j);
6517 if (!u)
6518 return NULL;
6519
Guido van Rossum5bdff602008-03-11 21:18:06 +00006520 j = 0; /* same as in first pass */
6521 q = u->str; /* next output char */
6522 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524 for (p = self->str; p < e; p++)
6525 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006526 if (tabsize > 0) {
6527 i = tabsize - (j % tabsize);
6528 j += i;
6529 while (i--) {
6530 if (q >= qe)
6531 goto overflow2;
6532 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006533 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006534 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006535 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006536 else {
6537 if (q >= qe)
6538 goto overflow2;
6539 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006540 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 if (*p == '\n' || *p == '\r')
6542 j = 0;
6543 }
6544
6545 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006546
6547 overflow2:
6548 Py_DECREF(u);
6549 overflow1:
6550 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006554PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006555 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556\n\
6557Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006558such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559arguments start and end are interpreted as in slice notation.\n\
6560\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006561Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563static PyObject *
6564unicode_find(PyUnicodeObject *self, PyObject *args)
6565{
Jesus Cea44e81682011-04-20 16:39:15 +02006566 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006567 Py_ssize_t start;
6568 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006569 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
Jesus Cea44e81682011-04-20 16:39:15 +02006571 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6572 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006575 result = stringlib_find_slice(
6576 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6577 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6578 start, end
6579 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580
6581 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006582
6583 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584}
6585
6586static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006587unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588{
6589 if (index < 0 || index >= self->length) {
6590 PyErr_SetString(PyExc_IndexError, "string index out of range");
6591 return NULL;
6592 }
6593
6594 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6595}
6596
6597static long
6598unicode_hash(PyUnicodeObject *self)
6599{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006600 /* Since Unicode objects compare equal to their ASCII string
6601 counterparts, they should use the individual character values
6602 as basis for their hash value. This is needed to assure that
6603 strings and Unicode objects behave in the same way as
6604 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
Martin v. Löwis18e16552006-02-15 17:27:45 +00006606 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006607 register Py_UNICODE *p;
6608 register long x;
6609
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006610#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006611 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006612#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006615 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006616 /*
6617 We make the hash of the empty string be 0, rather than using
6618 (prefix ^ suffix), since this slightly obfuscates the hash secret
6619 */
6620 if (len == 0) {
6621 self->hash = 0;
6622 return 0;
6623 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006624 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006625 x = _Py_HashSecret.prefix;
6626 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006627 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006628 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006629 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006630 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006631 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006632 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006633 self->hash = x;
6634 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006638 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642static PyObject *
6643unicode_index(PyUnicodeObject *self, PyObject *args)
6644{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006646 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006647 Py_ssize_t start;
6648 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
Jesus Cea44e81682011-04-20 16:39:15 +02006650 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6651 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006654 result = stringlib_find_slice(
6655 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6656 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6657 start, end
6658 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
6660 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 if (result < 0) {
6663 PyErr_SetString(PyExc_ValueError, "substring not found");
6664 return NULL;
6665 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006666
Martin v. Löwis18e16552006-02-15 17:27:45 +00006667 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
6676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006677unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678{
6679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6680 register const Py_UNICODE *e;
6681 int cased;
6682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 /* Shortcut for single character strings */
6684 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006688 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006689 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 e = p + PyUnicode_GET_SIZE(self);
6692 cased = 0;
6693 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006694 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6697 return PyBool_FromLong(0);
6698 else if (!cased && Py_UNICODE_ISLOWER(ch))
6699 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702}
6703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006705 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006707Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
6710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006711unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712{
6713 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6714 register const Py_UNICODE *e;
6715 int cased;
6716
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 /* Shortcut for single character strings */
6718 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006719 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006721 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006722 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 e = p + PyUnicode_GET_SIZE(self);
6726 cased = 0;
6727 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006729
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6731 return PyBool_FromLong(0);
6732 else if (!cased && Py_UNICODE_ISUPPER(ch))
6733 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736}
6737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006739 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006741Return True if S is a titlecased string and there is at least one\n\
6742character in S, i.e. upper- and titlecase characters may only\n\
6743follow uncased characters and lowercase characters only cased ones.\n\
6744Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
6746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006747unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
6749 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6750 register const Py_UNICODE *e;
6751 int cased, previous_is_cased;
6752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 /* Shortcut for single character strings */
6754 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6756 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006761
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 e = p + PyUnicode_GET_SIZE(self);
6763 cased = 0;
6764 previous_is_cased = 0;
6765 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006767
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006768 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6769 if (previous_is_cased)
6770 return PyBool_FromLong(0);
6771 previous_is_cased = 1;
6772 cased = 1;
6773 }
6774 else if (Py_UNICODE_ISLOWER(ch)) {
6775 if (!previous_is_cased)
6776 return PyBool_FromLong(0);
6777 previous_is_cased = 1;
6778 cased = 1;
6779 }
6780 else
6781 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784}
6785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006786PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006787 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006789Return True if all characters in S are whitespace\n\
6790and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
6792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006793unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
6795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6796 register const Py_UNICODE *e;
6797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 /* Shortcut for single character strings */
6799 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 Py_UNICODE_ISSPACE(*p))
6801 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006803 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006804 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006806
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 e = p + PyUnicode_GET_SIZE(self);
6808 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006809 if (!Py_UNICODE_ISSPACE(*p))
6810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813}
6814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006816 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006818Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006820
6821static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006822unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006823{
6824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6825 register const Py_UNICODE *e;
6826
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827 /* Shortcut for single character strings */
6828 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 Py_UNICODE_ISALPHA(*p))
6830 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831
6832 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006833 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006834 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835
6836 e = p + PyUnicode_GET_SIZE(self);
6837 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006838 if (!Py_UNICODE_ISALPHA(*p))
6839 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006840 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842}
6843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006844PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006845 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006846\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006847Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849
6850static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006851unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006852{
6853 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6854 register const Py_UNICODE *e;
6855
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856 /* Shortcut for single character strings */
6857 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 Py_UNICODE_ISALNUM(*p))
6859 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860
6861 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006862 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006863 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864
6865 e = p + PyUnicode_GET_SIZE(self);
6866 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006867 if (!Py_UNICODE_ISALNUM(*p))
6868 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871}
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006874 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006877False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
6879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006880unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
6882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6883 register const Py_UNICODE *e;
6884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 /* Shortcut for single character strings */
6886 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006887 Py_UNICODE_ISDECIMAL(*p))
6888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006891 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006893
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 e = p + PyUnicode_GET_SIZE(self);
6895 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006896 if (!Py_UNICODE_ISDECIMAL(*p))
6897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900}
6901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006902PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006903 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006905Return True if all characters in S are digits\n\
6906and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
6908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006909unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910{
6911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6912 register const Py_UNICODE *e;
6913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 /* Shortcut for single character strings */
6915 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006916 Py_UNICODE_ISDIGIT(*p))
6917 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006920 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006921 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006922
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 e = p + PyUnicode_GET_SIZE(self);
6924 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006925 if (!Py_UNICODE_ISDIGIT(*p))
6926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929}
6930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006932 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006935False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936
6937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006938unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
6940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6941 register const Py_UNICODE *e;
6942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 /* Shortcut for single character strings */
6944 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006945 Py_UNICODE_ISNUMERIC(*p))
6946 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006949 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006951
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 e = p + PyUnicode_GET_SIZE(self);
6953 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006954 if (!Py_UNICODE_ISNUMERIC(*p))
6955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958}
6959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006961 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962\n\
6963Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006964iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006967unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006969 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970}
6971
Martin v. Löwis18e16552006-02-15 17:27:45 +00006972static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973unicode_length(PyUnicodeObject *self)
6974{
6975 return self->length;
6976}
6977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006978PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006979 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006981Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006982done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
6984static PyObject *
6985unicode_ljust(PyUnicodeObject *self, PyObject *args)
6986{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006987 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006988 Py_UNICODE fillchar = ' ';
6989
Martin v. Löwis412fb672006-04-13 06:34:32 +00006990 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 return NULL;
6992
Tim Peters7a29bd52001-09-12 03:03:31 +00006993 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 Py_INCREF(self);
6995 return (PyObject*) self;
6996 }
6997
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006998 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999}
7000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007001PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007002 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007004Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
7006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007007unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 return fixup(self, fixlower);
7010}
7011
/* Strip modes; they double as indexes into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format string for each strip mode, indexed by the mode value */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string past its 3-character "|O:"
   prefix */
#define STRIPNAME(i) (&stripformat[i][3])
7020
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007021/* externally visible for str.strip(unicode) */
7022PyObject *
7023_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7024{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007025 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7026 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7027 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7028 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7029 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007030
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007031 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007032
Benjamin Peterson857ce152009-01-31 16:29:18 +00007033 i = 0;
7034 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007035 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7036 i++;
7037 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007038 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039
Benjamin Peterson857ce152009-01-31 16:29:18 +00007040 j = len;
7041 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007042 do {
7043 j--;
7044 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7045 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007046 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047
Benjamin Peterson857ce152009-01-31 16:29:18 +00007048 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007049 Py_INCREF(self);
7050 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007051 }
7052 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007053 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054}
7055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056
7057static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007060 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7061 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062
Benjamin Peterson857ce152009-01-31 16:29:18 +00007063 i = 0;
7064 if (striptype != RIGHTSTRIP) {
7065 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7066 i++;
7067 }
7068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
Benjamin Peterson857ce152009-01-31 16:29:18 +00007070 j = len;
7071 if (striptype != LEFTSTRIP) {
7072 do {
7073 j--;
7074 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7075 j++;
7076 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077
Benjamin Peterson857ce152009-01-31 16:29:18 +00007078 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7079 Py_INCREF(self);
7080 return (PyObject*)self;
7081 }
7082 else
7083 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
7087static PyObject *
7088do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007091
Benjamin Peterson857ce152009-01-31 16:29:18 +00007092 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7093 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094
Benjamin Peterson857ce152009-01-31 16:29:18 +00007095 if (sep != NULL && sep != Py_None) {
7096 if (PyUnicode_Check(sep))
7097 return _PyUnicode_XStrip(self, striptype, sep);
7098 else if (PyString_Check(sep)) {
7099 PyObject *res;
7100 sep = PyUnicode_FromObject(sep);
7101 if (sep==NULL)
7102 return NULL;
7103 res = _PyUnicode_XStrip(self, striptype, sep);
7104 Py_DECREF(sep);
7105 return res;
7106 }
7107 else {
7108 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007109 "%s arg must be None, unicode or str",
7110 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007111 return NULL;
7112 }
7113 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114
Benjamin Peterson857ce152009-01-31 16:29:18 +00007115 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116}
7117
7118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007119PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007120 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121\n\
7122Return a copy of the string S with leading and trailing\n\
7123whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007124If chars is given and not None, remove characters in chars instead.\n\
7125If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007126
7127static PyObject *
7128unicode_strip(PyUnicodeObject *self, PyObject *args)
7129{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007130 if (PyTuple_GET_SIZE(args) == 0)
7131 return do_strip(self, BOTHSTRIP); /* Common case */
7132 else
7133 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134}
7135
7136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007138 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139\n\
7140Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007141If chars is given and not None, remove characters in chars instead.\n\
7142If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143
7144static PyObject *
7145unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7146{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007147 if (PyTuple_GET_SIZE(args) == 0)
7148 return do_strip(self, LEFTSTRIP); /* Common case */
7149 else
7150 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007151}
7152
7153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007154PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007155 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156\n\
7157Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007158If chars is given and not None, remove characters in chars instead.\n\
7159If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007160
7161static PyObject *
7162unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7163{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007164 if (PyTuple_GET_SIZE(args) == 0)
7165 return do_strip(self, RIGHTSTRIP); /* Common case */
7166 else
7167 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007168}
7169
7170
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007172unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173{
7174 PyUnicodeObject *u;
7175 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007176 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007177 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
7179 if (len < 0)
7180 len = 0;
7181
Tim Peters7a29bd52001-09-12 03:03:31 +00007182 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 /* no repeat, return original string */
7184 Py_INCREF(str);
7185 return (PyObject*) str;
7186 }
Tim Peters8f422462000-09-09 06:13:41 +00007187
7188 /* ensure # of chars needed doesn't overflow int and # of bytes
7189 * needed doesn't overflow size_t
7190 */
7191 nchars = len * str->length;
7192 if (len && nchars / len != str->length) {
7193 PyErr_SetString(PyExc_OverflowError,
7194 "repeated string is too long");
7195 return NULL;
7196 }
7197 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7198 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7199 PyErr_SetString(PyExc_OverflowError,
7200 "repeated string is too long");
7201 return NULL;
7202 }
7203 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 if (!u)
7205 return NULL;
7206
7207 p = u->str;
7208
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007209 if (str->length == 1 && len > 0) {
7210 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007211 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007212 Py_ssize_t done = 0; /* number of characters copied this far */
7213 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007214 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007215 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007216 }
7217 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007218 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007219 Py_UNICODE_COPY(p+done, p, n);
7220 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007221 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
7224 return (PyObject*) u;
7225}
7226
7227PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007228 PyObject *subobj,
7229 PyObject *replobj,
7230 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231{
7232 PyObject *self;
7233 PyObject *str1;
7234 PyObject *str2;
7235 PyObject *result;
7236
7237 self = PyUnicode_FromObject(obj);
7238 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 str1 = PyUnicode_FromObject(subobj);
7241 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007242 Py_DECREF(self);
7243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 }
7245 str2 = PyUnicode_FromObject(replobj);
7246 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007247 Py_DECREF(self);
7248 Py_DECREF(str1);
7249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 }
Tim Petersced69f82003-09-16 20:30:58 +00007251 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007252 (PyUnicodeObject *)str1,
7253 (PyUnicodeObject *)str2,
7254 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 Py_DECREF(self);
7256 Py_DECREF(str1);
7257 Py_DECREF(str2);
7258 return result;
7259}
7260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007262 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263\n\
7264Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007265old replaced by new. If the optional argument count is\n\
7266given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267
7268static PyObject*
7269unicode_replace(PyUnicodeObject *self, PyObject *args)
7270{
7271 PyUnicodeObject *str1;
7272 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 PyObject *result;
7275
Martin v. Löwis18e16552006-02-15 17:27:45 +00007276 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 return NULL;
7278 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7279 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007282 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007283 Py_DECREF(str1);
7284 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286
7287 result = replace(self, str1, str2, maxcount);
7288
7289 Py_DECREF(str1);
7290 Py_DECREF(str2);
7291 return result;
7292}
7293
7294static
7295PyObject *unicode_repr(PyObject *unicode)
7296{
7297 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007298 PyUnicode_GET_SIZE(unicode),
7299 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300}
7301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007302PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007303 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304\n\
7305Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007306such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307arguments start and end are interpreted as in slice notation.\n\
7308\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310
7311static PyObject *
7312unicode_rfind(PyUnicodeObject *self, PyObject *args)
7313{
Jesus Cea44e81682011-04-20 16:39:15 +02007314 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007315 Py_ssize_t start;
7316 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007317 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
Jesus Cea44e81682011-04-20 16:39:15 +02007319 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7320 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007323 result = stringlib_rfind_slice(
7324 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7325 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7326 start, end
7327 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328
7329 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007330
7331 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332}
7333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007334PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007335 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
7339static PyObject *
7340unicode_rindex(PyUnicodeObject *self, PyObject *args)
7341{
Jesus Cea44e81682011-04-20 16:39:15 +02007342 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007343 Py_ssize_t start;
7344 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007345 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
Jesus Cea44e81682011-04-20 16:39:15 +02007347 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7348 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007351 result = stringlib_rfind_slice(
7352 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7353 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7354 start, end
7355 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007358
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 if (result < 0) {
7360 PyErr_SetString(PyExc_ValueError, "substring not found");
7361 return NULL;
7362 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364}
7365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007367 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007369Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007370done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371
7372static PyObject *
7373unicode_rjust(PyUnicodeObject *self, PyObject *args)
7374{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007375 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007376 Py_UNICODE fillchar = ' ';
7377
Martin v. Löwis412fb672006-04-13 06:34:32 +00007378 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 return NULL;
7380
Tim Peters7a29bd52001-09-12 03:03:31 +00007381 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 Py_INCREF(self);
7383 return (PyObject*) self;
7384 }
7385
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007386 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387}
7388
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391{
7392 /* standard clamping */
7393 if (start < 0)
7394 start = 0;
7395 if (end < 0)
7396 end = 0;
7397 if (end > self->length)
7398 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007399 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 /* full slice, return original string */
7401 Py_INCREF(self);
7402 return (PyObject*) self;
7403 }
7404 if (start > end)
7405 start = end;
7406 /* copy slice */
7407 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007408 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409}
7410
7411PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007412 PyObject *sep,
7413 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414{
7415 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 s = PyUnicode_FromObject(s);
7418 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007419 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007420 if (sep != NULL) {
7421 sep = PyUnicode_FromObject(sep);
7422 if (sep == NULL) {
7423 Py_DECREF(s);
7424 return NULL;
7425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
7427
7428 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7429
7430 Py_DECREF(s);
7431 Py_XDECREF(sep);
7432 return result;
7433}
7434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007436 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437\n\
7438Return a list of the words in S, using sep as the\n\
7439delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007440splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007441whitespace string is a separator and empty strings are\n\
7442removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
7444static PyObject*
7445unicode_split(PyUnicodeObject *self, PyObject *args)
7446{
7447 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
Martin v. Löwis18e16552006-02-15 17:27:45 +00007450 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
7452
7453 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007454 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007456 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007458 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459}
7460
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007461PyObject *
7462PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7463{
7464 PyObject* str_obj;
7465 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007466 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007467
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007468 str_obj = PyUnicode_FromObject(str_in);
7469 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007470 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007471 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007472 if (!sep_obj) {
7473 Py_DECREF(str_obj);
7474 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007475 }
7476
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007477 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007478 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7479 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7480 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007481
Fredrik Lundhb9479482006-05-26 17:22:38 +00007482 Py_DECREF(sep_obj);
7483 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007484
7485 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007486}
7487
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007488
7489PyObject *
7490PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7491{
7492 PyObject* str_obj;
7493 PyObject* sep_obj;
7494 PyObject* out;
7495
7496 str_obj = PyUnicode_FromObject(str_in);
7497 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007498 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007499 sep_obj = PyUnicode_FromObject(sep_in);
7500 if (!sep_obj) {
7501 Py_DECREF(str_obj);
7502 return NULL;
7503 }
7504
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007505 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007506 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7507 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7508 );
7509
7510 Py_DECREF(sep_obj);
7511 Py_DECREF(str_obj);
7512
7513 return out;
7514}
7515
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007516PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007517 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007519Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007521found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007522
7523static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007524unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007525{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007526 return PyUnicode_Partition((PyObject *)self, separator);
7527}
7528
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007529PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007530 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007531\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007532Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007533the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007534separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007535
7536static PyObject*
7537unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7538{
7539 return PyUnicode_RPartition((PyObject *)self, separator);
7540}
7541
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007542PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007543 PyObject *sep,
7544 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007545{
7546 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007547
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007548 s = PyUnicode_FromObject(s);
7549 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007550 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007551 if (sep != NULL) {
7552 sep = PyUnicode_FromObject(sep);
7553 if (sep == NULL) {
7554 Py_DECREF(s);
7555 return NULL;
7556 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007557 }
7558
7559 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7560
7561 Py_DECREF(s);
7562 Py_XDECREF(sep);
7563 return result;
7564}
7565
7566PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007567 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007568\n\
7569Return a list of the words in S, using sep as the\n\
7570delimiter string, starting at the end of the string and\n\
7571working to the front. If maxsplit is given, at most maxsplit\n\
7572splits are done. If sep is not specified, any whitespace string\n\
7573is a separator.");
7574
7575static PyObject*
7576unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7577{
7578 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007579 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007580
Martin v. Löwis18e16552006-02-15 17:27:45 +00007581 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007582 return NULL;
7583
7584 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007585 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007586 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007587 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007588 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007589 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007590}
7591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007592PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007593 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594\n\
7595Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007596Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
7599static PyObject*
7600unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7601{
Guido van Rossum86662912000-04-11 15:38:46 +00007602 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Guido van Rossum86662912000-04-11 15:38:46 +00007604 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 return NULL;
7606
Guido van Rossum86662912000-04-11 15:38:46 +00007607 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608}
7609
7610static
7611PyObject *unicode_str(PyUnicodeObject *self)
7612{
Fred Drakee4315f52000-05-09 19:53:39 +00007613 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007617 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
7619Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 return fixup(self, fixswapcase);
7626}
7627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007629 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630\n\
7631Return a copy of the string S, where all characters have been mapped\n\
7632through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007633Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7634Unmapped characters are left untouched. Characters mapped to None\n\
7635are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636
7637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007638unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639{
Tim Petersced69f82003-09-16 20:30:58 +00007640 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007641 self->length,
7642 table,
7643 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007647 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007652unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654 return fixup(self, fixupper);
7655}
7656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007657PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007658 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659\n\
Georg Brandl98064072008-09-09 19:26:00 +00007660Pad a numeric string S with zeros on the left, to fill a field\n\
7661of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663static PyObject *
7664unicode_zfill(PyUnicodeObject *self, PyObject *args)
7665{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007666 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 PyUnicodeObject *u;
7668
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669 Py_ssize_t width;
7670 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 return NULL;
7672
7673 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007674 if (PyUnicode_CheckExact(self)) {
7675 Py_INCREF(self);
7676 return (PyObject*) self;
7677 }
7678 else
7679 return PyUnicode_FromUnicode(
7680 PyUnicode_AS_UNICODE(self),
7681 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007682 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 }
7684
7685 fill = width - self->length;
7686
7687 u = pad(self, fill, 0, '0');
7688
Walter Dörwald068325e2002-04-15 13:36:47 +00007689 if (u == NULL)
7690 return NULL;
7691
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 if (u->str[fill] == '+' || u->str[fill] == '-') {
7693 /* move sign to beginning of string */
7694 u->str[0] = u->str[fill];
7695 u->str[fill] = '0';
7696 }
7697
7698 return (PyObject*) u;
7699}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007709PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007710 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007712Return True if S starts with the specified prefix, False otherwise.\n\
7713With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007714With optional end, stop comparing S at that position.\n\
7715prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
7717static PyObject *
7718unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007719 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Georg Brandl24250812006-06-09 18:45:48 +00007721 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007724 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007725 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Jesus Cea44e81682011-04-20 16:39:15 +02007727 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007728 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007729 if (PyTuple_Check(subobj)) {
7730 Py_ssize_t i;
7731 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7732 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007733 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007734 if (substring == NULL)
7735 return NULL;
7736 result = tailmatch(self, substring, start, end, -1);
7737 Py_DECREF(substring);
7738 if (result) {
7739 Py_RETURN_TRUE;
7740 }
7741 }
7742 /* nothing matched */
7743 Py_RETURN_FALSE;
7744 }
7745 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007746 if (substring == NULL) {
7747 if (PyErr_ExceptionMatches(PyExc_TypeError))
7748 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7749 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007750 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007751 }
Georg Brandl24250812006-06-09 18:45:48 +00007752 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007754 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755}
7756
7757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007759 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007761Return True if S ends with the specified suffix, False otherwise.\n\
7762With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007763With optional end, stop comparing S at that position.\n\
7764suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject *
7767unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007768 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769{
Georg Brandl24250812006-06-09 18:45:48 +00007770 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007772 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007773 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007774 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
Jesus Cea44e81682011-04-20 16:39:15 +02007776 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007777 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007778 if (PyTuple_Check(subobj)) {
7779 Py_ssize_t i;
7780 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7781 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007782 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007783 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007784 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007785 result = tailmatch(self, substring, start, end, +1);
7786 Py_DECREF(substring);
7787 if (result) {
7788 Py_RETURN_TRUE;
7789 }
7790 }
7791 Py_RETURN_FALSE;
7792 }
7793 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007794 if (substring == NULL) {
7795 if (PyErr_ExceptionMatches(PyExc_TypeError))
7796 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7797 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007798 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007799 }
Georg Brandl24250812006-06-09 18:45:48 +00007800 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007802 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803}
7804
7805
Eric Smitha9f7d622008-02-17 19:46:49 +00007806/* Implements do_string_format, which is unicode because of stringlib */
7807#include "stringlib/string_format.h"
7808
7809PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007810 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007811\n\
Eric Smith6c840852010-11-06 19:43:44 +00007812Return a formatted version of S, using substitutions from args and kwargs.\n\
7813The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007814
Eric Smithdc13b792008-05-30 18:10:04 +00007815static PyObject *
7816unicode__format__(PyObject *self, PyObject *args)
7817{
7818 PyObject *format_spec;
7819 PyObject *result = NULL;
7820 PyObject *tmp = NULL;
7821
7822 /* If 2.x, convert format_spec to the same type as value */
7823 /* This is to allow things like u''.format('') */
7824 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7825 goto done;
7826 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7827 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007828 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007829 goto done;
7830 }
7831 tmp = PyObject_Unicode(format_spec);
7832 if (tmp == NULL)
7833 goto done;
7834 format_spec = tmp;
7835
7836 result = _PyUnicode_FormatAdvanced(self,
7837 PyUnicode_AS_UNICODE(format_spec),
7838 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007839 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007840 Py_XDECREF(tmp);
7841 return result;
7842}
7843
Eric Smitha9f7d622008-02-17 19:46:49 +00007844PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007845 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007846\n\
Eric Smith6c840852010-11-06 19:43:44 +00007847Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007848
Robert Schuppenies901c9972008-06-10 10:10:31 +00007849static PyObject *
7850unicode__sizeof__(PyUnicodeObject *v)
7851{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007852 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7853 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007854}
7855
7856PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007857 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007858\n\
7859");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007860
7861static PyObject *
7862unicode_getnewargs(PyUnicodeObject *v)
7863{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007864 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007865}
7866
7867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007869 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007870 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7871 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007872 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007873 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7874 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7875 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7876 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7877 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7878 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7879 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007880 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007881 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7882 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7883 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007885 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007886/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7887 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7888 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7889 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007890 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007891 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007892 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007893 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007894 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7895 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7896 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7897 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7898 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7899 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7900 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7901 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7902 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7903 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7904 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7905 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7906 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7907 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007908 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007909 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7910 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7911 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7912 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007913 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007914#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007915 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916#endif
7917
7918#if 0
7919 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007920 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921#endif
7922
Benjamin Peterson857ce152009-01-31 16:29:18 +00007923 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 {NULL, NULL}
7925};
7926
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007927static PyObject *
7928unicode_mod(PyObject *v, PyObject *w)
7929{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007930 if (!PyUnicode_Check(v)) {
7931 Py_INCREF(Py_NotImplemented);
7932 return Py_NotImplemented;
7933 }
7934 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007935}
7936
7937static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007938 0, /*nb_add*/
7939 0, /*nb_subtract*/
7940 0, /*nb_multiply*/
7941 0, /*nb_divide*/
7942 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007943};
7944
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007946 (lenfunc) unicode_length, /* sq_length */
7947 PyUnicode_Concat, /* sq_concat */
7948 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7949 (ssizeargfunc) unicode_getitem, /* sq_item */
7950 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7951 0, /* sq_ass_item */
7952 0, /* sq_ass_slice */
7953 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954};
7955
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007956static PyObject*
7957unicode_subscript(PyUnicodeObject* self, PyObject* item)
7958{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007959 if (PyIndex_Check(item)) {
7960 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007961 if (i == -1 && PyErr_Occurred())
7962 return NULL;
7963 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007964 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007965 return unicode_getitem(self, i);
7966 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007967 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007968 Py_UNICODE* source_buf;
7969 Py_UNICODE* result_buf;
7970 PyObject* result;
7971
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007972 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007973 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007974 return NULL;
7975 }
7976
7977 if (slicelength <= 0) {
7978 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007979 } else if (start == 0 && step == 1 && slicelength == self->length &&
7980 PyUnicode_CheckExact(self)) {
7981 Py_INCREF(self);
7982 return (PyObject *)self;
7983 } else if (step == 1) {
7984 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007985 } else {
7986 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007987 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7988 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007990 if (result_buf == NULL)
7991 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007992
7993 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7994 result_buf[i] = source_buf[cur];
7995 }
Tim Petersced69f82003-09-16 20:30:58 +00007996
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007997 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007998 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 return result;
8000 }
8001 } else {
8002 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8003 return NULL;
8004 }
8005}
8006
8007static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008008 (lenfunc)unicode_length, /* mp_length */
8009 (binaryfunc)unicode_subscript, /* mp_subscript */
8010 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008011};
8012
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008015 Py_ssize_t index,
8016 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017{
8018 if (index != 0) {
8019 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008020 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 return -1;
8022 }
8023 *ptr = (void *) self->str;
8024 return PyUnicode_GET_DATA_SIZE(self);
8025}
8026
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027static Py_ssize_t
8028unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008029 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030{
8031 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008032 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 return -1;
8034}
8035
8036static int
8037unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008038 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
8040 if (lenp)
8041 *lenp = PyUnicode_GET_DATA_SIZE(self);
8042 return 1;
8043}
8044
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008045static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008047 Py_ssize_t index,
8048 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
8050 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008051
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 if (index != 0) {
8053 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008054 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 return -1;
8056 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008057 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008059 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008060 *ptr = (void *) PyString_AS_STRING(str);
8061 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062}
8063
8064/* Helpers for PyUnicode_Format() */
8065
8066static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008067getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008069 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008071 (*p_argidx)++;
8072 if (arglen < 0)
8073 return args;
8074 else
8075 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 }
8077 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008078 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 return NULL;
8080}
8081
/* Bit flags collected while parsing a '%' format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087
Martin v. Löwis18e16552006-02-15 17:27:45 +00008088static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008089strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008091 register Py_ssize_t i;
8092 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008094 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 return len;
8097}
8098
Neal Norwitzfc76d632006-01-10 06:03:13 +00008099static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008100longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8101{
Tim Peters15231542006-02-16 01:08:01 +00008102 Py_ssize_t result;
8103
Neal Norwitzfc76d632006-01-10 06:03:13 +00008104 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008105 result = strtounicode(buffer, (char *)buffer);
8106 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008107}
8108
Guido van Rossum078151d2002-08-11 04:24:12 +00008109/* XXX To save some code duplication, formatfloat/long/int could have been
8110 shared with stringobject.c, converting from 8-bit to Unicode after the
8111 formatting is done. */
8112
Mark Dickinson18cfada2009-11-23 18:46:41 +00008113/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8114
8115static PyObject *
8116formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008118 char *p;
8119 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008121
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 x = PyFloat_AsDouble(v);
8123 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008124 return NULL;
8125
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008127 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008128
Mark Dickinson18cfada2009-11-23 18:46:41 +00008129 p = PyOS_double_to_string(x, type, prec,
8130 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8131 if (p == NULL)
8132 return NULL;
8133 result = PyUnicode_FromStringAndSize(p, strlen(p));
8134 PyMem_Free(p);
8135 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136}
8137
Tim Peters38fd5b62000-09-21 05:43:11 +00008138static PyObject*
8139formatlong(PyObject *val, int flags, int prec, int type)
8140{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008141 char *buf;
8142 int i, len;
8143 PyObject *str; /* temporary string object. */
8144 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008145
Benjamin Peterson857ce152009-01-31 16:29:18 +00008146 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8147 if (!str)
8148 return NULL;
8149 result = _PyUnicode_New(len);
8150 if (!result) {
8151 Py_DECREF(str);
8152 return NULL;
8153 }
8154 for (i = 0; i < len; i++)
8155 result->str[i] = buf[i];
8156 result->str[len] = 0;
8157 Py_DECREF(str);
8158 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008159}
8160
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161static int
8162formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008163 size_t buflen,
8164 int flags,
8165 int prec,
8166 int type,
8167 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008169 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008170 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8171 * + 1 + 1
8172 * = 24
8173 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008174 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008175 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 long x;
8177
8178 x = PyInt_AsLong(v);
8179 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008180 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008181 if (x < 0 && type == 'u') {
8182 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008183 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008184 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8185 sign = "-";
8186 else
8187 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008189 prec = 1;
8190
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008191 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8192 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008193 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008194 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008195 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008196 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 return -1;
8198 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008199
8200 if ((flags & F_ALT) &&
8201 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008202 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008203 * of issues that cause pain:
8204 * - when 0 is being converted, the C standard leaves off
8205 * the '0x' or '0X', which is inconsistent with other
8206 * %#x/%#X conversions and inconsistent with Python's
8207 * hex() function
8208 * - there are platforms that violate the standard and
8209 * convert 0 with the '0x' or '0X'
8210 * (Metrowerks, Compaq Tru64)
8211 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008212 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008213 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008214 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008215 * We can achieve the desired consistency by inserting our
8216 * own '0x' or '0X' prefix, and substituting %x/%X in place
8217 * of %#x/%#X.
8218 *
8219 * Note that this is the same approach as used in
8220 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008221 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008222 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8223 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008224 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008225 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008226 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8227 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008228 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008229 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008230 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008231 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008232 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008233 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
8235
8236static int
8237formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008238 size_t buflen,
8239 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
Ezio Melotti32125152010-02-25 17:36:04 +00008241 PyObject *unistr;
8242 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008244 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008245 if (PyUnicode_GET_SIZE(v) != 1)
8246 goto onError;
8247 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008250 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008251 if (PyString_GET_SIZE(v) != 1)
8252 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008253 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8254 with a UnicodeDecodeError if 'char' is not decodable with the
8255 default encoding (usually ASCII, but it might be something else) */
8256 str = PyString_AS_STRING(v);
8257 if ((unsigned char)str[0] > 0x7F) {
8258 /* the char is not ASCII; try to decode the string using the
8259 default encoding and return -1 to let the UnicodeDecodeError
8260 be raised if the string can't be decoded */
8261 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8262 if (unistr == NULL)
8263 return -1;
8264 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8265 Py_DECREF(unistr);
8266 }
8267 else
8268 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270
8271 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008274 x = PyInt_AsLong(v);
8275 if (x == -1 && PyErr_Occurred())
8276 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008277#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008278 if (x < 0 || x > 0x10ffff) {
8279 PyErr_SetString(PyExc_OverflowError,
8280 "%c arg not in range(0x110000) "
8281 "(wide Python build)");
8282 return -1;
8283 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008284#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008285 if (x < 0 || x > 0xffff) {
8286 PyErr_SetString(PyExc_OverflowError,
8287 "%c arg not in range(0x10000) "
8288 "(narrow Python build)");
8289 return -1;
8290 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008291#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
8294 buf[1] = '\0';
8295 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008296
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008297 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008298 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008299 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008300 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301}
8302
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the scratch buffer in which ints and
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does its own bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8312
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008314 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315{
8316 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008317 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 int args_owned = 0;
8319 PyUnicodeObject *result = NULL;
8320 PyObject *dict = NULL;
8321 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008322
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008324 PyErr_BadInternalCall();
8325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
8327 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008328 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 fmt = PyUnicode_AS_UNICODE(uformat);
8331 fmtcnt = PyUnicode_GET_SIZE(uformat);
8332
8333 reslen = rescnt = fmtcnt + 100;
8334 result = _PyUnicode_New(reslen);
8335 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008336 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 res = PyUnicode_AS_UNICODE(result);
8338
8339 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008340 arglen = PyTuple_Size(args);
8341 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
8343 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008344 arglen = -1;
8345 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
Benjamin Petersonda2c7eb2013-03-23 22:32:00 -05008347 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8348 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008349 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350
8351 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008352 if (*fmt != '%') {
8353 if (--rescnt < 0) {
8354 rescnt = fmtcnt + 100;
8355 reslen += rescnt;
8356 if (_PyUnicode_Resize(&result, reslen) < 0)
8357 goto onError;
8358 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8359 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008360 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008361 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008362 }
8363 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008364 /* Got a format specifier */
8365 int flags = 0;
8366 Py_ssize_t width = -1;
8367 int prec = -1;
8368 Py_UNICODE c = '\0';
8369 Py_UNICODE fill;
8370 int isnumok;
8371 PyObject *v = NULL;
8372 PyObject *temp = NULL;
8373 Py_UNICODE *pbuf;
8374 Py_UNICODE sign;
8375 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008376 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008377
8378 fmt++;
8379 if (*fmt == '(') {
8380 Py_UNICODE *keystart;
8381 Py_ssize_t keylen;
8382 PyObject *key;
8383 int pcount = 1;
8384
8385 if (dict == NULL) {
8386 PyErr_SetString(PyExc_TypeError,
8387 "format requires a mapping");
8388 goto onError;
8389 }
8390 ++fmt;
8391 --fmtcnt;
8392 keystart = fmt;
8393 /* Skip over balanced parentheses */
8394 while (pcount > 0 && --fmtcnt >= 0) {
8395 if (*fmt == ')')
8396 --pcount;
8397 else if (*fmt == '(')
8398 ++pcount;
8399 fmt++;
8400 }
8401 keylen = fmt - keystart - 1;
8402 if (fmtcnt < 0 || pcount > 0) {
8403 PyErr_SetString(PyExc_ValueError,
8404 "incomplete format key");
8405 goto onError;
8406 }
8407#if 0
8408 /* keys are converted to strings using UTF-8 and
8409 then looked up since Python uses strings to hold
8410 variables names etc. in its namespaces and we
8411 wouldn't want to break common idioms. */
8412 key = PyUnicode_EncodeUTF8(keystart,
8413 keylen,
8414 NULL);
8415#else
8416 key = PyUnicode_FromUnicode(keystart, keylen);
8417#endif
8418 if (key == NULL)
8419 goto onError;
8420 if (args_owned) {
8421 Py_DECREF(args);
8422 args_owned = 0;
8423 }
8424 args = PyObject_GetItem(dict, key);
8425 Py_DECREF(key);
8426 if (args == NULL) {
8427 goto onError;
8428 }
8429 args_owned = 1;
8430 arglen = -1;
8431 argidx = -2;
8432 }
8433 while (--fmtcnt >= 0) {
8434 switch (c = *fmt++) {
8435 case '-': flags |= F_LJUST; continue;
8436 case '+': flags |= F_SIGN; continue;
8437 case ' ': flags |= F_BLANK; continue;
8438 case '#': flags |= F_ALT; continue;
8439 case '0': flags |= F_ZERO; continue;
8440 }
8441 break;
8442 }
8443 if (c == '*') {
8444 v = getnextarg(args, arglen, &argidx);
8445 if (v == NULL)
8446 goto onError;
8447 if (!PyInt_Check(v)) {
8448 PyErr_SetString(PyExc_TypeError,
8449 "* wants int");
8450 goto onError;
8451 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008452 width = PyInt_AsSsize_t(v);
8453 if (width == -1 && PyErr_Occurred())
8454 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008455 if (width < 0) {
8456 flags |= F_LJUST;
8457 width = -width;
8458 }
8459 if (--fmtcnt >= 0)
8460 c = *fmt++;
8461 }
8462 else if (c >= '0' && c <= '9') {
8463 width = c - '0';
8464 while (--fmtcnt >= 0) {
8465 c = *fmt++;
8466 if (c < '0' || c > '9')
8467 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008468 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008469 PyErr_SetString(PyExc_ValueError,
8470 "width too big");
8471 goto onError;
8472 }
8473 width = width*10 + (c - '0');
8474 }
8475 }
8476 if (c == '.') {
8477 prec = 0;
8478 if (--fmtcnt >= 0)
8479 c = *fmt++;
8480 if (c == '*') {
8481 v = getnextarg(args, arglen, &argidx);
8482 if (v == NULL)
8483 goto onError;
8484 if (!PyInt_Check(v)) {
8485 PyErr_SetString(PyExc_TypeError,
8486 "* wants int");
8487 goto onError;
8488 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008489 prec = _PyInt_AsInt(v);
8490 if (prec == -1 && PyErr_Occurred())
8491 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008492 if (prec < 0)
8493 prec = 0;
8494 if (--fmtcnt >= 0)
8495 c = *fmt++;
8496 }
8497 else if (c >= '0' && c <= '9') {
8498 prec = c - '0';
8499 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008500 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008501 if (c < '0' || c > '9')
8502 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008503 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008504 PyErr_SetString(PyExc_ValueError,
8505 "prec too big");
8506 goto onError;
8507 }
8508 prec = prec*10 + (c - '0');
8509 }
8510 }
8511 } /* prec */
8512 if (fmtcnt >= 0) {
8513 if (c == 'h' || c == 'l' || c == 'L') {
8514 if (--fmtcnt >= 0)
8515 c = *fmt++;
8516 }
8517 }
8518 if (fmtcnt < 0) {
8519 PyErr_SetString(PyExc_ValueError,
8520 "incomplete format");
8521 goto onError;
8522 }
8523 if (c != '%') {
8524 v = getnextarg(args, arglen, &argidx);
8525 if (v == NULL)
8526 goto onError;
8527 }
8528 sign = 0;
8529 fill = ' ';
8530 switch (c) {
8531
8532 case '%':
8533 pbuf = formatbuf;
8534 /* presume that buffer length is at least 1 */
8535 pbuf[0] = '%';
8536 len = 1;
8537 break;
8538
8539 case 's':
8540 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008541 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008542 temp = v;
8543 Py_INCREF(temp);
8544 }
8545 else {
8546 PyObject *unicode;
8547 if (c == 's')
8548 temp = PyObject_Unicode(v);
8549 else
8550 temp = PyObject_Repr(v);
8551 if (temp == NULL)
8552 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008553 if (PyUnicode_Check(temp))
8554 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008555 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008556 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008557 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8558 PyString_GET_SIZE(temp),
8559 NULL,
8560 "strict");
8561 Py_DECREF(temp);
8562 temp = unicode;
8563 if (temp == NULL)
8564 goto onError;
8565 }
8566 else {
8567 Py_DECREF(temp);
8568 PyErr_SetString(PyExc_TypeError,
8569 "%s argument has non-string str()");
8570 goto onError;
8571 }
8572 }
8573 pbuf = PyUnicode_AS_UNICODE(temp);
8574 len = PyUnicode_GET_SIZE(temp);
8575 if (prec >= 0 && len > prec)
8576 len = prec;
8577 break;
8578
8579 case 'i':
8580 case 'd':
8581 case 'u':
8582 case 'o':
8583 case 'x':
8584 case 'X':
8585 if (c == 'i')
8586 c = 'd';
8587 isnumok = 0;
8588 if (PyNumber_Check(v)) {
8589 PyObject *iobj=NULL;
8590
8591 if (PyInt_Check(v) || (PyLong_Check(v))) {
8592 iobj = v;
8593 Py_INCREF(iobj);
8594 }
8595 else {
8596 iobj = PyNumber_Int(v);
8597 if (iobj==NULL) iobj = PyNumber_Long(v);
8598 }
8599 if (iobj!=NULL) {
8600 if (PyInt_Check(iobj)) {
8601 isnumok = 1;
8602 pbuf = formatbuf;
8603 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8604 flags, prec, c, iobj);
8605 Py_DECREF(iobj);
8606 if (len < 0)
8607 goto onError;
8608 sign = 1;
8609 }
8610 else if (PyLong_Check(iobj)) {
8611 isnumok = 1;
8612 temp = formatlong(iobj, flags, prec, c);
8613 Py_DECREF(iobj);
8614 if (!temp)
8615 goto onError;
8616 pbuf = PyUnicode_AS_UNICODE(temp);
8617 len = PyUnicode_GET_SIZE(temp);
8618 sign = 1;
8619 }
8620 else {
8621 Py_DECREF(iobj);
8622 }
8623 }
8624 }
8625 if (!isnumok) {
8626 PyErr_Format(PyExc_TypeError,
8627 "%%%c format: a number is required, "
8628 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8629 goto onError;
8630 }
8631 if (flags & F_ZERO)
8632 fill = '0';
8633 break;
8634
8635 case 'e':
8636 case 'E':
8637 case 'f':
8638 case 'F':
8639 case 'g':
8640 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008641 temp = formatfloat(v, flags, prec, c);
8642 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008643 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008644 pbuf = PyUnicode_AS_UNICODE(temp);
8645 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008646 sign = 1;
8647 if (flags & F_ZERO)
8648 fill = '0';
8649 break;
8650
8651 case 'c':
8652 pbuf = formatbuf;
8653 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8654 if (len < 0)
8655 goto onError;
8656 break;
8657
8658 default:
8659 PyErr_Format(PyExc_ValueError,
8660 "unsupported format character '%c' (0x%x) "
8661 "at index %zd",
8662 (31<=c && c<=126) ? (char)c : '?',
8663 (int)c,
8664 (Py_ssize_t)(fmt - 1 -
8665 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008666 goto onError;
8667 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008668 if (sign) {
8669 if (*pbuf == '-' || *pbuf == '+') {
8670 sign = *pbuf++;
8671 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008672 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008673 else if (flags & F_SIGN)
8674 sign = '+';
8675 else if (flags & F_BLANK)
8676 sign = ' ';
8677 else
8678 sign = 0;
8679 }
8680 if (width < len)
8681 width = len;
8682 if (rescnt - (sign != 0) < width) {
8683 reslen -= rescnt;
8684 rescnt = width + fmtcnt + 100;
8685 reslen += rescnt;
8686 if (reslen < 0) {
8687 Py_XDECREF(temp);
8688 PyErr_NoMemory();
8689 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008690 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008691 if (_PyUnicode_Resize(&result, reslen) < 0) {
8692 Py_XDECREF(temp);
8693 goto onError;
8694 }
8695 res = PyUnicode_AS_UNICODE(result)
8696 + reslen - rescnt;
8697 }
8698 if (sign) {
8699 if (fill != ' ')
8700 *res++ = sign;
8701 rescnt--;
8702 if (width > len)
8703 width--;
8704 }
8705 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8706 assert(pbuf[0] == '0');
8707 assert(pbuf[1] == c);
8708 if (fill != ' ') {
8709 *res++ = *pbuf++;
8710 *res++ = *pbuf++;
8711 }
8712 rescnt -= 2;
8713 width -= 2;
8714 if (width < 0)
8715 width = 0;
8716 len -= 2;
8717 }
8718 if (width > len && !(flags & F_LJUST)) {
8719 do {
8720 --rescnt;
8721 *res++ = fill;
8722 } while (--width > len);
8723 }
8724 if (fill == ' ') {
8725 if (sign)
8726 *res++ = sign;
8727 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8728 assert(pbuf[0] == '0');
8729 assert(pbuf[1] == c);
8730 *res++ = *pbuf++;
8731 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008732 }
8733 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008734 Py_UNICODE_COPY(res, pbuf, len);
8735 res += len;
8736 rescnt -= len;
8737 while (--width >= len) {
8738 --rescnt;
8739 *res++ = ' ';
8740 }
8741 if (dict && (argidx < arglen) && c != '%') {
8742 PyErr_SetString(PyExc_TypeError,
8743 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008744 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008745 goto onError;
8746 }
8747 Py_XDECREF(temp);
8748 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 } /* until end */
8750 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008751 PyErr_SetString(PyExc_TypeError,
8752 "not all arguments converted during string formatting");
8753 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 }
8755
Thomas Woutersa96affe2006-03-12 00:29:36 +00008756 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008757 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008759 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 }
8761 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 return (PyObject *)result;
8763
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008764 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 Py_XDECREF(result);
8766 Py_DECREF(uformat);
8767 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008768 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 }
8770 return NULL;
8771}
8772
8773static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008774 (readbufferproc) unicode_buffer_getreadbuf,
8775 (writebufferproc) unicode_buffer_getwritebuf,
8776 (segcountproc) unicode_buffer_getsegcount,
8777 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778};
8779
Jeremy Hylton938ace62002-07-17 16:30:39 +00008780static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008781unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8782
Tim Peters6d6c1a32001-08-02 04:15:00 +00008783static PyObject *
8784unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8785{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008786 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008787 static char *kwlist[] = {"string", "encoding", "errors", 0};
8788 char *encoding = NULL;
8789 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008790
Benjamin Peterson857ce152009-01-31 16:29:18 +00008791 if (type != &PyUnicode_Type)
8792 return unicode_subtype_new(type, args, kwds);
8793 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008794 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008795 return NULL;
8796 if (x == NULL)
8797 return (PyObject *)_PyUnicode_New(0);
8798 if (encoding == NULL && errors == NULL)
8799 return PyObject_Unicode(x);
8800 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008801 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008802}
8803
Guido van Rossume023fe02001-08-30 03:12:59 +00008804static PyObject *
8805unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8806{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008807 PyUnicodeObject *tmp, *pnew;
8808 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008809
Benjamin Peterson857ce152009-01-31 16:29:18 +00008810 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8811 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8812 if (tmp == NULL)
8813 return NULL;
8814 assert(PyUnicode_Check(tmp));
8815 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8816 if (pnew == NULL) {
8817 Py_DECREF(tmp);
8818 return NULL;
8819 }
8820 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8821 if (pnew->str == NULL) {
8822 _Py_ForgetReference((PyObject *)pnew);
8823 PyObject_Del(pnew);
8824 Py_DECREF(tmp);
8825 return PyErr_NoMemory();
8826 }
8827 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8828 pnew->length = n;
8829 pnew->hash = tmp->hash;
8830 Py_DECREF(tmp);
8831 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008832}
8833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008834PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008835 "unicode(object='') -> unicode object\n\
8836unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008837\n\
8838Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008839encoding defaults to the current default string encoding.\n\
8840errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008841
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008843 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008844 "unicode", /* tp_name */
8845 sizeof(PyUnicodeObject), /* tp_size */
8846 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008848 (destructor)unicode_dealloc, /* tp_dealloc */
8849 0, /* tp_print */
8850 0, /* tp_getattr */
8851 0, /* tp_setattr */
8852 0, /* tp_compare */
8853 unicode_repr, /* tp_repr */
8854 &unicode_as_number, /* tp_as_number */
8855 &unicode_as_sequence, /* tp_as_sequence */
8856 &unicode_as_mapping, /* tp_as_mapping */
8857 (hashfunc) unicode_hash, /* tp_hash*/
8858 0, /* tp_call*/
8859 (reprfunc) unicode_str, /* tp_str */
8860 PyObject_GenericGetAttr, /* tp_getattro */
8861 0, /* tp_setattro */
8862 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008863 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008864 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008865 unicode_doc, /* tp_doc */
8866 0, /* tp_traverse */
8867 0, /* tp_clear */
8868 PyUnicode_RichCompare, /* tp_richcompare */
8869 0, /* tp_weaklistoffset */
8870 0, /* tp_iter */
8871 0, /* tp_iternext */
8872 unicode_methods, /* tp_methods */
8873 0, /* tp_members */
8874 0, /* tp_getset */
8875 &PyBaseString_Type, /* tp_base */
8876 0, /* tp_dict */
8877 0, /* tp_descr_get */
8878 0, /* tp_descr_set */
8879 0, /* tp_dictoffset */
8880 0, /* tp_init */
8881 0, /* tp_alloc */
8882 unicode_new, /* tp_new */
8883 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884};
8885
8886/* Initialize the Unicode implementation */
8887
Thomas Wouters78890102000-07-22 19:25:51 +00008888void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008890 /* XXX - move this array to unicodectype.c ? */
8891 Py_UNICODE linebreak[] = {
8892 0x000A, /* LINE FEED */
8893 0x000D, /* CARRIAGE RETURN */
8894 0x001C, /* FILE SEPARATOR */
8895 0x001D, /* GROUP SEPARATOR */
8896 0x001E, /* RECORD SEPARATOR */
8897 0x0085, /* NEXT LINE */
8898 0x2028, /* LINE SEPARATOR */
8899 0x2029, /* PARAGRAPH SEPARATOR */
8900 };
8901
Fred Drakee4315f52000-05-09 19:53:39 +00008902 /* Init the implementation */
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008903 if (!unicode_empty) {
8904 unicode_empty = _PyUnicode_New(0);
8905 if (!unicode_empty)
8906 return;
8907 }
Neal Norwitze1fdb322006-07-21 05:32:28 +00008908
Guido van Rossumcacfc072002-05-24 19:01:59 +00008909 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008910 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008911
8912 /* initialize the linebreak bloom filter */
8913 bloom_linebreak = make_bloom_mask(
8914 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8915 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008916
8917 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008918
8919 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8920 Py_FatalError("Can't initialize field name iterator type");
8921
8922 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8923 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924}
8925
8926/* Finalize the Unicode implementation */
8927
Christian Heimes3b718a72008-02-14 12:47:33 +00008928int
8929PyUnicode_ClearFreeList(void)
8930{
8931 int freelist_size = numfree;
8932 PyUnicodeObject *u;
8933
8934 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008935 PyUnicodeObject *v = u;
8936 u = *(PyUnicodeObject **)u;
8937 if (v->str)
8938 PyObject_DEL(v->str);
8939 Py_XDECREF(v->defenc);
8940 PyObject_Del(v);
8941 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008942 }
8943 free_list = NULL;
8944 assert(numfree == 0);
8945 return freelist_size;
8946}
8947
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948void
Thomas Wouters78890102000-07-22 19:25:51 +00008949_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008951 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008953 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008954
Serhiy Storchakac59c85c2013-01-26 12:13:40 +02008955 for (i = 0; i < 256; i++)
8956 Py_CLEAR(unicode_latin1[i]);
8957
Christian Heimes3b718a72008-02-14 12:47:33 +00008958 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008960
Anthony Baxterac6bd462006-04-13 02:06:09 +00008961#ifdef __cplusplus
8962}
8963#endif